## Retrieve Data from SourceGraph

In [None]:
! pip install -Uqqq requests

[?25l[K     |█████▏                          | 10 kB 21.3 MB/s eta 0:00:01[K     |██████████▍                     | 20 kB 16.9 MB/s eta 0:00:01[K     |███████████████▋                | 30 kB 10.7 MB/s eta 0:00:01[K     |████████████████████▉           | 40 kB 6.2 MB/s eta 0:00:01[K     |██████████████████████████      | 51 kB 7.5 MB/s eta 0:00:01[K     |███████████████████████████████▎| 61 kB 8.0 MB/s eta 0:00:01[K     |████████████████████████████████| 62 kB 1.5 MB/s 
[?25h

In [None]:
import json
import time
import datetime
import requests
import os
from pathlib import Path
import concurrent.futures

In [None]:
# Base directory for everything this notebook writes; the dataset files are
# placed in a `datasets/` folder below it. Defaults to the current directory.
OUTPUT_DIR = "."

## Searching for test files using SourceGraph GraphQL API

### Collect All Hits
See the [Sourcegraph documentation](https://docs.sourcegraph.com/cli/how-tos/creating_an_access_token) for how to create your own Sourcegraph API access token.

In [None]:
# Prefer a token supplied through the environment so it does not have to be
# pasted into (and saved with) the notebook; the placeholder is only a
# fallback and must be replaced by hand when the variable is not set.
sg_token = os.environ.get("SOURCEGRAPH_API_TOKEN", "ENTER SOURCEGRAPH_API_TOKEN")

# GraphQL request: every non-fork Python file named *_test.py that contains
# "import unittest", returning repository and file name/url for each match.
payload = {
  "query": "query ($query: String!) {\n  search(query: $query, version: V2) {\n    results {\n      results {\n        __typename\n        ... on FileMatch {\n          ...FileMatchFields\n        }\n      }\n      matchCount\n      elapsedMilliseconds\n    }\n  }\n}\n\nfragment FileMatchFields on FileMatch {\n  repository {\n    name\n    url\n  }\n  file {\n    name\n    url\n  }\n}",
  "variables": {
    "query": "file:.*_test.py file:has.content(import unittest) count:1000000 fork:no lang:python"
  }
}

destination = f"{OUTPUT_DIR}/datasets/raw_datasets.json"
path = Path(destination)

# The raw search result is cached on disk; the request is only sent when the
# cache file does not exist yet.
if not path.exists():
  r = requests.post('https://sourcegraph.com/.api/graphql', json=payload, headers={'Authorization': f"token {sg_token}"})
  status = r.status_code

  if status == 200:
    # Decode the response before touching the disk: if decoding fails, no
    # empty cache file is left behind to block the next attempt.
    data = r.json()["data"]["search"]["results"]

    # The datasets folder does not exist on a fresh checkout.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w') as file:
      json.dump(data, file, indent = 4)
  else:
    print(f"Status Code: {status}")

### Filter Hits

#### By existence of Source File

In [None]:
def search_source(query, schema):
  """Run one Sourcegraph GraphQL search and return its match if it is unique.

  Args:
    query: Sourcegraph search query, passed as the `$query` GraphQL variable.
    schema: GraphQL document to execute.

  Returns:
    The only element of the response's `results` list when the response
    reports a `matchCount` of exactly 1. Returns None in every other case:
    non-200 status, missing `data`/`search` payload, or zero / multiple
    matches.

  Raises:
    requests.exceptions.RequestException: if the request fails twice in a row.
  """
  endpoint = 'https://sourcegraph.com/.api/graphql'
  headers = {'Authorization': f"token {sg_token}"}
  payload = {
    "query": schema,
    "variables": {
      "query": query
    }
  }

  try:
    r = requests.post(endpoint, json=payload, headers=headers)
  except requests.exceptions.RequestException:
    # Retry once for transient network failures. Certificate verification
    # stays enabled on the retry: the request carries the API token, so it
    # must not be sent over an unverified connection.
    r = requests.post(endpoint, json=payload, headers=headers)

  status = r.status_code

  if status != 200:
    print(f"\rStatus Code: {status}", end="")
    print(f"\r{r.headers}", end="")
    if status == 500:
      # Back off only when the server supplied a usable delay in seconds;
      # the header may be absent or not a plain integer.
      retry_after = r.headers.get("Retry-After", "")
      if retry_after.isdigit():
        time.sleep(int(retry_after))
    return None

  # A 200 response can still lack a usable payload.
  data = (r.json().get("data") or {}).get("search")
  if data is None:
    return None

  results = data["results"]["results"]
  search_count = data["results"]['matchCount']

  # Only an unambiguous hit is accepted.
  if search_count == 1 and results:
    return results[0]
  return None
      
# Source files that were located on the server, keyed by file name.
source_files = {}
# Source/test pairs, keyed by the source file name.
source_test_map = {}

def map_source_to_test(test_result):
  """Record a source/test pair for `test_result` when its source file is known.

  The source file name is the test file name with every "_test" removed. When
  that name is a key of `source_files`, the entry's "source_file" value and
  the test's "file" value are stored in `source_test_map` under that name.
  Note that the "file" entry is removed from `test_result` in that case.
  """
  source_name = test_result["file"]["name"].replace("_test", "")

  # Tests without a located source file are left untouched.
  if source_name not in source_files:
    return

  source_test_map[source_name] = {
    "source": source_files[source_name]["source_file"],
    "test": test_result.pop("file"),
  }

# GraphQL document for the per-file lookups: runs the search given in `$query`
# and requests, for each FileMatch, the file's name and url, plus the overall
# matchCount.
schema = "query ($query: String!) {\n  search(query: $query, version: V2) {\n    results {\n      results {\n        __typename\n        ... on FileMatch {\n          ...FileMatchFields\n        }\n      }\n      matchCount\n    }\n  }\n}\n\nfragment FileMatchFields on FileMatch {\n  file {\n    name\n    url\n  }\n}"

def create_source_test_map(max_workers=300):
  """Build the source/test mapping and write it next to the raw dataset.

  Steps:
    1. Load the raw test-file hits from `destination`.
    2. For every hit, search its repository for a file named like the test
       file with "_test" removed, and register each uniquely found file in
       `source_files` (first hit per file name wins).
    3. Pair every test hit with its registered source file via
       `map_source_to_test`.
    4. Dump `source_test_map` to the "filtered" counterpart of `destination`.

  Args:
    max_workers: number of threads used for the source-file lookups.
  """
  with open(destination, 'r') as file:
    data = json.load(file)

  test_results = data["results"]

  # One lookup query per test file; fields that are not needed in the output
  # are dropped from the hit at the same time.
  sg_queries = []
  for test_result in test_results:
    repo = test_result["repository"]["name"]
    source_file = test_result["file"]["name"].replace("_test", "")
    del test_result["__typename"]
    del test_result["repository"]

    sg_queries.append(f"repo:{repo} (file:.*/{source_file}) fork:no lang:python")

  total_queries = len(sg_queries)
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit everything first so the lookups really run in parallel; calling
    # .result() right after each submit would serialise them.
    futures = [executor.submit(search_source, query, schema) for query in sg_queries]

    # Results are consumed in submission order, so which hit is kept for a
    # duplicated file name does not depend on thread timing.
    for done, future in enumerate(futures, start=1):
      result = future.result()

      if result:
        file_name = result["file"]["name"]
        if file_name not in source_files:
          source_files[file_name] = {"source_file": {"name" : file_name, "url" : result["file"]["url"]}}

      print("\r{0:.2f} % progess with source file search".format(100 * done / total_queries), end="")

  # The mapping is plain dictionary work, so it runs directly in this thread;
  # progress is measured against the number of test results being processed.
  total_tests = len(test_results)
  for done, test_result in enumerate(test_results, start=1):
    map_source_to_test(test_result)
    print("\r{0:.2f} % progess with source test mapping".format(100 * done / total_tests), end="")

  # Open the output only once the data is complete, so a failure above does
  # not leave a truncated file behind.
  with open(destination.replace("raw", "filtered"), 'w') as file:
    json.dump({"results": source_test_map}, file, indent = 4)
          
# Reuse the filtered dataset when it is already on disk and parses as JSON;
# otherwise (re)build it from the raw dataset.
try:
  with open(destination.replace("raw", "filtered"), 'r') as file:
    data = json.load(file)

except (OSError, ValueError):
  # OSError: file missing or unreadable. ValueError: invalid JSON
  # (json.JSONDecodeError is a ValueError). Anything else, including a
  # keyboard interrupt, is allowed to propagate.
  create_source_test_map()

# From here on `destination` and `path` refer to the filtered dataset.
destination = destination.replace("raw", "filtered")
path = Path(destination)

731.44 % progess with source test mapping

#### By License and Save

In [None]:
# Phrases that identify an accepted license; a source file must contain at
# least one of them.
licenses = [
    "\"Licensed under the Apache License, Version 2.0\"", # Apache License 2.0
    "\"Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>\"", # GNU AGPLv3, GNU GPLv3, GNU LGPLv3
    "\"Mozilla Public License Version 2.0\"", # Mozilla Public License Version 2.0,
    "\"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\"", # BSD License
    "\"MIT License\"" # MIT License
]

with open(destination, 'r') as file:
  data = json.load(file)
  results = data["results"]
  files = results.keys()

fileCount = 0

# Source and test files are saved in sibling folders next to the dataset file.
src_folder = destination.replace("filtered_datasets.json", "src")
test_folder = os.path.join(os.path.dirname(src_folder), "test")

# Create each folder independently so a partially created layout is repaired.
os.makedirs(src_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Loop-invariant pieces. A separate name is used for the content schema so the
# global `schema` (which requests `url`) is left intact for the other cells.
content_schema = schema.replace("url", "content")
license_filter = ' OR '.join(licenses)

def _repo_from_file_url(url):
  """Return the repository part of a Sourcegraph file URL.

  File URLs are expected to look like "/<host>/<owner>/<repo>/-/blob/<path>";
  everything before "/-/" is the repository name. URLs without that marker
  fall back to the previous behaviour (the first two path components).
  """
  if "/-/" in url:
    return url.split("/-/", 1)[0].lstrip("/")
  return '/'.join(url.split('/')[:3]).replace('/', '', 1)

def _fetch_licensed_pair(source_name):
  """Fetch the contents of a licensed source file and of its test file.

  Returns (source_name, test_name, source_content, test_content), or None
  when either the license-filtered source search or the test search does not
  yield a unique match.
  """
  entry = results[source_name]
  repo = _repo_from_file_url(entry["source"]["url"])

  source_hit = search_source(f"repo:{repo}(file:{source_name} content:{license_filter})", content_schema)
  if not source_hit:
    return None

  # Use the recorded test file name instead of rebuilding it from the source
  # name, which is not always reversible.
  test_name = entry["test"]["name"]
  test_hit = search_source(f"repo:{repo}(file:{test_name})", content_schema)
  if not test_hit:
    return None

  return source_name, test_name, source_hit["file"]["content"], test_hit["file"]["content"]

with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
  # Submit all lookups before collecting any result so they run in parallel.
  pending = [executor.submit(_fetch_licensed_pair, name) for name in files]

  # Files are written from this thread, in submission order.
  for future in pending:
    pair = future.result()
    if pair is None:
      continue

    source_name, test_name, source_content, test_content = pair
    source_path = os.path.join(src_folder, source_name)
    test_path = os.path.join(test_folder, test_name)

    # Both halves are known to be valid here; existing files are kept.
    # UTF-8 is explicit because file contents may hold non-ASCII text.
    if not Path(source_path).exists():
      with open(source_path, "w", encoding="utf-8") as f:
        f.write(source_content)

    if not Path(test_path).exists():
      with open(test_path, "w", encoding="utf-8") as f:
        f.write(test_content)

    fileCount += 1
    print(f"\r{fileCount} file(s) out of {len(files)}", end="")

1572 file(s) out of 14996