In [51]:
# imports
import requests
import json
import csv
import time
import datetime
from string import Template
from Config import *
import sqlite3
from sqlite3 import Error

In [52]:
# Variables
headers = {"Authorization": API_TOKEN}
min_stars = 0
max_stars = 1000
last_activity = 90 # within the last __ days
created = 364 * 4 # within the last __ days
min_pull_num = 0 # amount of pull requests a repository needs
watchers = 0 # amount of watchers a repository needs

In [53]:
# Builds the query filter string compatible to github
def query_filter( min_stars, max_stars, last_activity, created ):
    date_last_act = datetime.datetime.now() - datetime.timedelta( days=last_activity )
    date_created = datetime.datetime.now() - datetime.timedelta( days=created )
    stars = f'{min_stars}..{max_stars}'

    return f'is:public archived:false fork:false stars:{stars} pushed:20{date_last_act:%y-%m-%d}..* created:20{date_created:%y-%m-%d}'

In [54]:
# Funtion that uses requests.post to make the API call
def run_query(query, variables):
    request = requests.post('https://api.github.com/graphql', 
                            json={'query': query, 'variables' : variables}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception(f'ERROR [{request.status_code}]: Query failed to execute...\nRESPONSE: {request.text}')

In [55]:
# setups the github graphql queries
def setup_query( end_cursor ) :
    query_string = query_filter( min_stars, max_stars, last_activity, created )
    variables = {
        "query_string" : query_filter( min_stars, max_stars, last_activity, created),
        "end_cursor" : end_cursor
    }
    query = '''
    query($query_string : String!) {
        rateLimit{
            cost
            remaining
            resetAt
        }
        search(query: $query_string, type: REPOSITORY, first:50) {
        pageInfo {
            hasNextPage
            endCursor
        }
            repositoryCount
            nodes {
                ... on Repository {
                    id
                    name
                    owner {
                        login
                    }
                    createdAt
                    isMirror
                    isFork
                    diskUsage
                    primaryLanguage {
                        name
                    }
                    languages(first:100) {
                        totalCount
                        nodes {
                            name
                        }
                    }
                    contributors : mentionableUsers {
                        totalCount
                    }
                    watchers {
                        totalCount
                    }
                    stargazers {
                        totalCount
                    }
                    forkCount
                    issues {
                        totalCount
                    }
                    commits : defaultBranchRef {
                        target {
                            ... on Commit {
                                history {
                                    totalCount
                                }
                            }
                        }
                    }
                    pullRequests {
                        totalCount
                    }
                    releases {
                        totalCount
                    }
                    licenseInfo {
                        id
                        name
                        pseudoLicense
                    }
                    url
                }
            }
        }
    }'''
    query2 = '''
    query($query_string : String!, $end_cursor : String!) {
        rateLimit{
            cost
            remaining
            resetAt
        }
        search(query: $query_string, type: REPOSITORY, first:50, after: $end_cursor) {
        pageInfo {
            hasNextPage
            endCursor
        }
            repositoryCount
            nodes {
                ... on Repository {
                    owner {
                        login
                    }
                    id
                    name
                    description
                    createdAt
                    pushedAt
                    isDisabled
                    isMirror
                    isFork
                    isLocked
                    diskUsage
                    primaryLanguage {
                        name
                    }
                    languages(first:100) {
                        totalCount
                        nodes {
                            name
                        }
                    }
                    contributors : mentionableUsers {
                        totalCount
                    }
                    watchers {
                        totalCount
                    }
                    stargazers {
                        totalCount
                    }
                    forkCount
                    issues {
                        totalCount
                    }
                    commits : defaultBranchRef {
                        target {
                            ... on Commit {
                                history {
                                    totalCount
                                }
                            }
                        }
                    }
                    pullRequests {
                        totalCount
                    }
                    releases {
                        totalCount
                    }
                    licenseInfo {
                        id
                        name
                        pseudoLicense
                    }
                    url
                }
            }
        }
    }'''
    if(end_cursor == "") : return (query, variables)
    else : return (query2, variables)

In [56]:
# Runs the query and iterates through all pages of repositories
def iterate_queries( create_dict_method ):

    end_cursor = ""
    end_cursor_string = ""
    hasNextPage = True
    index = 0
    data_list = []
    
    print("[WORKING] Running script to collect all repositories. ")
    while( hasNextPage ):
        query = setup_query( end_cursor_string )
        result = run_query( query[0], query[1] )
        print(json.dumps(result))
        
        try: 
            result["errors"]
            break
        except KeyError:
            print('[SUCCESS] NO ERROR')
        
        nodes = result["data"]["search"]["nodes"]
        for node in nodes:
            data_list.append( create_dict_method( node ) )

        try:
            # if there is a next page, update the endcursor string and continue loop
            if( result["data"]["search"]["pageInfo"]["hasNextPage"] ):
                end_cursor = result["data"]["search"]["pageInfo"]["endCursor"]
                end_cursor_string = f', after:"{end_cursor}"'
            else:
                hasNextPage = False
        except KeyError:
            print(json.dumps(result))
            break

        index += 1
        time.sleep(1)
    return data_list

In [57]:
# creates a single repository tuple instance to be added to a sqlite database
def create_repo_tuple( node ) :
    primaryLanguage = "null"
    if(node["primaryLanguage"] != None) :
        primaryLanguage = node["primaryLanguage"]["name"]
        
    license_id = "0"
    license_name = "null"
    pseudoLicense = False
    if(node["licenseInfo"] != None) :
        license_id = node["licenseInfo"]["id"]
        license_name = node["licenseInfo"]["name"]
        pseudoLicense = node["licenseInfo"]["pseudoLicense"]
        
    return (
    node["id"], 
    node["name"], 
    node["owner"]["login"], 
    node["createdAt"], 
    node["isMirror"], 
    node["isFork"], 
    node["diskUsage"], 
    primaryLanguage,
    node["contributors"]["totalCount"], 
    node["watchers"]["totalCount"], 
    node["stargazers"]["totalCount"], 
    node["forkCount"], 
    node["issues"]["totalCount"], 
    node["commits"]["target"]["history"]["totalCount"], 
    node["pullRequests"]["totalCount"], 
    node["releases"]["totalCount"],
    license_id, 
    license_name, 
    pseudoLicense, 
    node["url"]
    )

In [58]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        return conn
    except Error as e:
        print(e)
        
    return conn

In [59]:
def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return:
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)

In [60]:
# insert repository tuples into the sqlite database
def insert_repositories(conn, repository):
    """
    Create a new repository into the repository table
    :param conn:
    :param repository:
    :return: project id
    """
    sql = """INSERT INTO repositories (id, name, owner, start_date, isMirror, isFork, diskUsage, 
                     primaryLanguage, numContributors, watchers, stars, forks, issues, commits, 
                     pullRequests, releases, license_id, license_name, pseudoLicense, url) VALUES
                     (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
    cur = conn.cursor()
    cur.executemany(sql, repository)
    return cur.lastrowid

In [61]:
# name of the sqlite file
database = r"C:\sqlite\db\research.db"

# create a database connection
conn = create_connection(database)

In [62]:
# sql for creating the repos table
sql_create_repos_table = """CREATE TABLE IF NOT EXISTS repositories (
                                        id text PRIMARY KEY NOT NULL,
                                        name text NOT NULL,
                                        owner text NOT NULL,
                                        start_date text NOT NULL,
                                        isMirror boolean NOT NULL,
                                        isFork boolean NOT NULL,
                                        diskUsage int NOT NULL,
                                        primaryLanguage text NOT NULL,
                                        numContributors int NOT NULL,
                                        watchers int NOT NULL,
                                        stars int NOT NULL,
                                        forks int NOT NULL,
                                        issues int NOT NULL,
                                        commits int NOT NULL,
                                        pullRequests int NOT NULL,
                                        releases int NOT NULL,
                                        license_id text NOT NULL,
                                        license_name text NOT NULL,
                                        pseudoLicense boolean NOT NULL,
                                        url text NOT NULL
                                    ); """

with conn:
    create_table(conn, sql_create_repos_table)

In [63]:
# when conn is valid
with conn:
    # run query to find all repositories
    # pass in the repository tuple builder
    # insert the tuple into the database
    insert_repositories(conn, iterate_queries( create_repo_tuple ) )

[WORKING] Running script to collect all repositories. 
{"data": {"rateLimit": {"cost": 1, "remaining": 4993, "resetAt": "2020-02-11T08:59:47Z"}, "search": {"pageInfo": {"hasNextPage": true, "endCursor": "Y3Vyc29yOjUw"}, "repositoryCount": 281, "nodes": [{"id": "MDEwOlJlcG9zaXRvcnk1MTc1NzQ1OA==", "name": "zapret", "owner": {"login": "bol-van"}, "createdAt": "2016-02-15T13:30:50Z", "isMirror": false, "isFork": false, "diskUsage": 10107, "primaryLanguage": {"name": "C"}, "languages": {"totalCount": 5, "nodes": [{"name": "Makefile"}, {"name": "Shell"}, {"name": "C"}, {"name": "C++"}, {"name": "Objective-C"}]}, "contributors": {"totalCount": 3}, "watchers": {"totalCount": 83}, "stargazers": {"totalCount": 915}, "forkCount": 139, "issues": {"totalCount": 43}, "commits": {"target": {"history": {"totalCount": 50}}}, "pullRequests": {"totalCount": 4}, "releases": {"totalCount": 0}, "licenseInfo": null, "url": "https://github.com/bol-van/zapret"}, {"id": "MDEwOlJlcG9zaXRvcnk1MTc0Mzg4NQ==", "name

{"errors": [{"type": "INVALID_CURSOR_ARGUMENTS", "path": ["search", "pageInfo", "hasNextPage"], "locations": [{"line": 10, "column": 13}], "message": "`, after:\"Y3Vyc29yOjUw\"` does not appear to be a valid cursor."}]}


IntegrityError: UNIQUE constraint failed: repositories.id