In [66]:
# imports
import requests
import json
import csv
import time
import datetime
from string import Template
from Config import *
import sqlite3
from sqlite3 import Error

In [67]:
# Variables: request headers and search thresholds read by the cells below.
# API_TOKEN is not defined in this notebook; presumably it is provided by the
# `from Config import *` import -- TODO confirm.
headers = {"Authorization": API_TOKEN}
min_stars = 100 # lower end of the star range passed to query_filter
max_stars = 10000 # upper end of the star range passed to query_filter
last_activity = 90 # within the last __ days
created = 364 * 4 # within the last __ days
# NOTE(review): the two thresholds below are not referenced by any cell visible here.
min_pull_num = 0 # amount of pull requests a repository needs
watchers = 0 # amount of watchers a repository needs

In [68]:
# Builds the search filter string in GitHub's search query syntax
def query_filter( min_stars, max_stars, last_activity, created ):
    """Build the GitHub search filter string used by the repository query.

    :param min_stars: lower end of the star range
    :param max_stars: upper end of the star range
    :param last_activity: number of days; the pushed-date cut-off is now minus this
    :param created: number of days; the created-date cut-off is now minus this
    :return: filter string restricted to public, non-archived, non-fork
             repositories within the star range and the two date cut-offs
    """
    # Sample the clock once so both cut-offs are relative to the same instant.
    now = datetime.datetime.now()
    date_last_act = now - datetime.timedelta( days=last_activity )
    date_created = now - datetime.timedelta( days=created )
    stars = f'{min_stars}..{max_stars}'

    # %Y writes the full four-digit year. A literal "20" followed by %y is only
    # correct for dates in 2000-2099 and silently produces the wrong year otherwise.
    return f'is:public archived:false fork:false stars:{stars} pushed:{date_last_act:%Y-%m-%d}..* created:{date_created:%Y-%m-%d}..*'

In [69]:
# Function that uses requests.post to make the API call
def run_query(query, variables, timeout=30):
    """POST a GraphQL query to the GitHub API and return the decoded JSON body.

    :param query: GraphQL query text
    :param variables: dict of GraphQL variables sent alongside the query
    :param timeout: seconds requests waits on the server before giving up
    :return: the response body parsed from JSON
    :raises Exception: when the HTTP status code is not 200
    :raises requests.exceptions.Timeout: when the server does not answer in time
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole paging loop.
    response = requests.post('https://api.github.com/graphql',
                             json={'query': query, 'variables' : variables},
                             headers=headers,
                             timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'ERROR [{response.status_code}]: Query failed to execute...\nRESPONSE: {response.text}')
    return response.json()

In [70]:
# Sets up the GitHub GraphQL query and its variables
def setup_query( end_cursor ) :
    """Build the GraphQL query text and variables for one page of the search.

    :param end_cursor: cursor of the previous page, or "" for the first page
    :return: tuple (query_text, variables). The first-page query is returned
             when end_cursor is "", otherwise the paginated query that passes
             end_cursor as the `after` argument.
    """
    # Build the filter once; it is only needed as a GraphQL variable.
    variables = {
        "query_string" : query_filter( min_stars, max_stars, last_activity, created ),
        "end_cursor" : end_cursor
    }
    # First page: no `after` argument. The variables dict still carries
    # "end_cursor" although this query does not declare it.
    # NOTE(review): this query selects fewer fields than the paginated one
    # (no description, pushedAt, isDisabled, isLocked) -- confirm that is intended.
    first_page_query = '''
    query($query_string : String!) {
        rateLimit{
            cost
            remaining
            resetAt
        }
        search(query: $query_string, type: REPOSITORY, first:50) {
            pageInfo {
                hasNextPage
                endCursor
            }
            repositoryCount
            nodes {
                ... on Repository {
                    id
                    name
                    owner {
                        login
                    }
                    createdAt
                    isMirror
                    isFork
                    diskUsage
                    primaryLanguage {
                        name
                    }
                    languages(first:100) {
                        totalCount
                        nodes {
                            name
                        }
                    }
                    contributors : mentionableUsers {
                        totalCount
                    }
                    watchers {
                        totalCount
                    }
                    stargazers {
                        totalCount
                    }
                    forkCount
                    issues {
                        totalCount
                    }
                    commits : defaultBranchRef {
                        target {
                            ... on Commit {
                                history {
                                    totalCount
                                }
                            }
                        }
                    }
                    pullRequests {
                        totalCount
                    }
                    releases {
                        totalCount
                    }
                    licenseInfo {
                        id
                        name
                        pseudoLicense
                    }
                    url
                }
            }
        }
    }'''
    # Follow-up pages: same search, continued after the given cursor.
    next_page_query = '''
    query($query_string : String!, $end_cursor : String) {
        rateLimit{
            cost
            remaining
            resetAt
        }
        search(query: $query_string, type: REPOSITORY, first:50, after: $end_cursor) {
            pageInfo {
                hasNextPage
                endCursor
            }
            repositoryCount
            nodes {
                ... on Repository {
                    owner {
                        login
                    }
                    id
                    name
                    description
                    createdAt
                    pushedAt
                    isDisabled
                    isMirror
                    isFork
                    isLocked
                    diskUsage
                    primaryLanguage {
                        name
                    }
                    languages(first:100) {
                        totalCount
                        nodes {
                            name
                        }
                    }
                    contributors : mentionableUsers {
                        totalCount
                    }
                    watchers {
                        totalCount
                    }
                    stargazers {
                        totalCount
                    }
                    forkCount
                    issues {
                        totalCount
                    }
                    commits : defaultBranchRef {
                        target {
                            ... on Commit {
                                history {
                                    totalCount
                                }
                            }
                        }
                    }
                    pullRequests {
                        totalCount
                    }
                    releases {
                        totalCount
                    }
                    licenseInfo {
                        id
                        name
                        pseudoLicense
                    }
                    url
                }
            }
        }
    }'''
    if end_cursor == "":
        return (first_page_query, variables)
    return (next_page_query, variables)

In [71]:
# Runs the query and iterates through all pages of repositories
def iterate_queries( conn, create_dict_method ):
    """Page through the repository search and store every page in the database.

    Each iteration builds the query for the current cursor, runs it, hands the
    result to insert_repositories and then advances to the next page. The loop
    stops when the response carries an "errors" key, when the page info reports
    no further page, or when the page info is missing from the response.

    :param conn: database connection passed through to insert_repositories
    :param create_dict_method: callable passed through to insert_repositories
                               to convert one repository node into a row
    :return: None
    """
    end_cursor = ""
    page_index = 0

    print("[WORKING] Running script to collect all repositories. ")
    while True:
        print("[WORKING] On page " + str(page_index))
        query, variables = setup_query( end_cursor )
        result = run_query( query, variables )

        # A response can have HTTP status 200 and still report errors in its body.
        if "errors" in result:
            print(json.dumps(result))
            print("[FAILURE] ERROR DETECTED")
            break
        print('[SUCCESS] NO ERROR')

        insert_repositories( conn, result, create_dict_method )

        try:
            page_info = result["data"]["search"]["pageInfo"]
            if not page_info["hasNextPage"]:
                # Last page stored; leave without the pointless trailing sleep.
                break
            end_cursor = page_info["endCursor"]
        except KeyError:
            print("[WORKING] No next page. ")
            break

        page_index += 1
        # Pause between pages to space out the API calls.
        time.sleep(1)

In [72]:
def create_repo_tuple( node ) :
    """Flatten one repository node of the search result into a 20-value row.

    The value order matches the column list used by insert_repositories.
    Missing optional objects are replaced by placeholders: "null" for the
    primary language, ("0", "null", False) for the license fields and 0 for
    the commit count.

    :param node: dict for one repository as returned by the search query
    :return: tuple of 20 values
    :raises KeyError: when a required key is absent from node
    """
    language = node["primaryLanguage"]
    primaryLanguage = language["name"] if language is not None else "null"

    license_info = node["licenseInfo"]
    if license_info is not None:
        license_id = license_info["id"]
        license_name = license_info["name"]
        pseudoLicense = license_info["pseudoLicense"]
    else:
        license_id, license_name, pseudoLicense = "0", "null", False

    # "commits" is the query's alias for defaultBranchRef. It is null when the
    # repository has no default branch, and the target only carries "history"
    # when it is a Commit, so walk the chain defensively instead of crashing.
    branch_ref = node["commits"] or {}
    target = branch_ref.get("target") or {}
    history = target.get("history") or {}
    commit_count = history.get("totalCount", 0)

    return (
        node["id"],
        node["name"],
        node["owner"]["login"],
        node["createdAt"],
        node["isMirror"],
        node["isFork"],
        node["diskUsage"],
        primaryLanguage,
        node["contributors"]["totalCount"],
        node["watchers"]["totalCount"],
        node["stargazers"]["totalCount"],
        node["forkCount"],
        node["issues"]["totalCount"],
        commit_count,
        node["pullRequests"]["totalCount"],
        node["releases"]["totalCount"],
        license_id,
        license_name,
        pseudoLicense,
        node["url"]
    )

In [73]:
def create_connection(db_file):
    """Open the SQLite database stored at db_file.

    A failure raised by sqlite3 is printed rather than propagated.

    :param db_file: path of the database file
    :return: the open Connection, or None when connecting failed
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        print(exc)

    # Only reached when connecting failed.
    return None

In [74]:
def create_table(conn, create_table_sql):
    """Run a CREATE TABLE statement on the given connection.

    A failure raised by sqlite3 is printed rather than propagated.

    :param conn: Connection object
    :param create_table_sql: the CREATE TABLE statement to execute
    :return: None
    """
    try:
        conn.cursor().execute(create_table_sql)
    except Error as exc:
        print(exc)

In [75]:
# insert repository tuples into the sqlite database
def insert_repositories( conn, result, create_dict_method ):
    """Insert all repository nodes of one search result page into the repositories table.

    Rows whose primary key already exists are skipped by INSERT OR IGNORE.
    This function does not commit; that is left to the caller.

    :param conn: open database connection
    :param result: decoded query response; the nodes are read from
                   result["data"]["search"]["nodes"]
    :param create_dict_method: callable turning one node into the sequence of
                               20 values bound to the statement's placeholders
    :return: the cursor's lastrowid after the batch insert. The sqlite3 docs
             only define this attribute for single execute() calls, so do not
             treat it as the id of an inserted row.
    """
    rows = [ create_dict_method( node ) for node in result["data"]["search"]["nodes"] ]

    sql = """INSERT or IGNORE INTO repositories (id, name, owner, start_date, isMirror, isFork, diskUsage, 
                     primaryLanguage, numContributors, watchers, stars, forks, issues, commits, 
                     pullRequests, releases, license_id, license_name, pseudoLicense, url) VALUES
                     (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
    cur = conn.cursor()
    cur.executemany(sql, rows)
    print("[SUCCESS] Inserted repos into the database. ")
    return cur.lastrowid

In [76]:
# name of the sqlite file
# NOTE(review): the path is relative to the working directory and uses
# backslash separators -- presumably written for Windows; confirm before
# running on another platform.
database = r".\databases\research2.db"

# create a database connection
# create_connection returns None when connecting fails, so conn may be None here.
conn = create_connection(database)

In [77]:
# sql for creating the repos table
# The 20 columns are listed in the same order as the values produced by
# create_repo_tuple and the column list in insert_repositories.
# Every column is NOT NULL and the id column is the primary key.
sql_create_repos_table = """CREATE TABLE IF NOT EXISTS repositories (
                                        id text PRIMARY KEY NOT NULL,
                                        name text NOT NULL,
                                        owner text NOT NULL,
                                        start_date text NOT NULL,
                                        isMirror boolean NOT NULL,
                                        isFork boolean NOT NULL,
                                        diskUsage int NOT NULL,
                                        primaryLanguage text NOT NULL,
                                        numContributors int NOT NULL,
                                        watchers int NOT NULL,
                                        stars int NOT NULL,
                                        forks int NOT NULL,
                                        issues int NOT NULL,
                                        commits int NOT NULL,
                                        pullRequests int NOT NULL,
                                        releases int NOT NULL,
                                        license_id text NOT NULL,
                                        license_name text NOT NULL,
                                        pseudoLicense boolean NOT NULL,
                                        url text NOT NULL
                                    ); """

# Create the table if it does not exist yet; create_table prints sqlite3
# errors instead of raising them.
with conn:
    create_table(conn, sql_create_repos_table)

In [None]:
# when conn is valid
# NOTE(review): nothing in this cell checks conn; if create_connection returned
# None the with statement below fails.
# Per the sqlite3 docs, using the connection as a context manager commits when
# the block finishes normally and rolls back when it raises, so the inserted
# pages are only committed once the whole run completes.
with conn:
    # run query to find all repositories
    # pass in the repository tuple builder
    # insert the tuple into the database
    iterate_queries( conn, create_repo_tuple )

[WORKING] Running script to collect all repositories. 
[WORKING] On page 0
{"data": {"rateLimit": {"cost": 1, "remaining": 4989, "resetAt": "2020-02-12T21:00:53Z"}, "search": {"pageInfo": {"hasNextPage": true, "endCursor": "Y3Vyc29yOjUw"}, "repositoryCount": 7395800, "nodes": [{"id": "MDEwOlJlcG9zaXRvcnkxMzYzMzI4MTY=", "name": "jd-assistant", "owner": {"login": "tychxn"}, "createdAt": "2018-06-06T13:29:23Z", "isMirror": false, "isFork": false, "diskUsage": 6983, "primaryLanguage": {"name": "Python"}, "languages": {"totalCount": 3, "nodes": [{"name": "Python"}, {"name": "HTML"}, {"name": "JavaScript"}]}, "contributors": {"totalCount": 3}, "watchers": {"totalCount": 41}, "stargazers": {"totalCount": 1000}, "forkCount": 285, "issues": {"totalCount": 93}, "commits": {"target": {"history": {"totalCount": 160}}}, "pullRequests": {"totalCount": 9}, "releases": {"totalCount": 0}, "licenseInfo": {"id": "MDc6TGljZW5zZTEz", "name": "MIT License", "pseudoLicense": false}, "url": "https://github.co

[WORKING] On page 1
{"data": {"rateLimit": {"cost": 1, "remaining": 4988, "resetAt": "2020-02-12T21:00:52Z"}, "search": {"pageInfo": {"hasNextPage": true, "endCursor": "Y3Vyc29yOjEwMA=="}, "repositoryCount": 7395802, "nodes": [{"owner": {"login": "hoshsadiq"}, "id": "MDEwOlJlcG9zaXRvcnkxMDM4NTgzNzQ=", "name": "adblock-nocoin-list", "description": "Block lists to prevent JavaScript miners ", "createdAt": "2017-09-17T20:10:24Z", "pushedAt": "2020-01-01T18:15:45Z", "isDisabled": false, "isMirror": false, "isFork": false, "isLocked": false, "diskUsage": 514, "primaryLanguage": null, "languages": {"totalCount": 0, "nodes": []}, "contributors": {"totalCount": 19}, "watchers": {"totalCount": 51}, "stargazers": {"totalCount": 992}, "forkCount": 69, "issues": {"totalCount": 143}, "commits": {"target": {"history": {"totalCount": 597}}}, "pullRequests": {"totalCount": 256}, "releases": {"totalCount": 0}, "licenseInfo": {"id": "MDc6TGljZW5zZTEz", "name": "MIT License", "pseudoLicense": false}, "ur

[WORKING] On page 2
{"data": {"rateLimit": {"cost": 1, "remaining": 4987, "resetAt": "2020-02-12T21:00:53Z"}, "search": {"pageInfo": {"hasNextPage": true, "endCursor": "Y3Vyc29yOjE1MA=="}, "repositoryCount": 7395818, "nodes": [{"owner": {"login": "fireeye"}, "id": "MDEwOlJlcG9zaXRvcnk2MjA4NzA2OQ==", "name": "flare-fakenet-ng", "description": "FakeNet-NG - Next Generation Dynamic Network Analysis Tool", "createdAt": "2016-06-27T20:46:39Z", "pushedAt": "2020-02-12T18:51:31Z", "isDisabled": false, "isMirror": false, "isFork": false, "isLocked": false, "diskUsage": 1292, "primaryLanguage": {"name": "Python"}, "languages": {"totalCount": 3, "nodes": [{"name": "Python"}, {"name": "HTML"}, {"name": "Shell"}]}, "contributors": {"totalCount": 24}, "watchers": {"totalCount": 102}, "stargazers": {"totalCount": 978}, "forkCount": 225, "issues": {"totalCount": 71}, "commits": {"target": {"history": {"totalCount": 290}}}, "pullRequests": {"totalCount": 64}, "releases": {"totalCount": 3}, "licenseInf

[WORKING] On page 3
