# Notebook dedicated to the github parsing for our RIMEL project
## Authors : 
- Fabien Airaud
- Axel Delille
- Yvann Ponce
- Apoorva Srinivas Appadoo
---

In [1]:
%pip install selenium

Collecting seleniumNote: you may need to restart the kernel to use updated packages.

  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting wsproto>=0.14 (fro

In [63]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from dotenv import load_dotenv
from os import getenv

import urllib.parse
import json


The first step is to prepare all the required variables.

In [64]:
def load_and_process_config(file_path):
    """Load the JSON configuration file and return its parsed content.

    Args:
        file_path: Path of the JSON configuration file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON files are UTF-8; without an explicit encoding, open() falls back to the
    # platform locale encoding and can mangle non-ASCII search queries.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [68]:
# Set up Chrome options
chrome_options = Options()
# Override the user-agent string sent by the automated browser
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

# Fetching github credentials (load_dotenv() makes values from a .env file
# available through the environment before they are read)
load_dotenv()
githubUsername = getenv("GH_USERNAME")
githubPassword = getenv("GH_PASSWORD")

# Fail fast with an explicit message: getenv() returns None for an unset variable,
# which would otherwise only surface later, once the browser is already open.
if not githubUsername or not githubPassword:
    raise RuntimeError(
        "GitHub credentials are missing: define GH_USERNAME and GH_PASSWORD "
        "in the environment or in a .env file."
    )

# Config filepath, relative to this file
config = "conf.json"

Here we define the utility functions used to parse the GitHub search page.

In [69]:
def convert_github_placeholder_to_int(value_str):
    """Convert an abbreviated counter such as ``"55.8k"`` into an integer.

    Supported suffixes are ``k`` (thousand), ``M`` (million) and ``B`` (billion),
    matched case-insensitively. Surrounding whitespace and thousands separators
    (``"1,234"``) are ignored.

    Args:
        value_str: Text of the counter, e.g. ``"123"``, ``"55.8k"`` or ``"2M"``.

    Returns:
        The counter as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a valid number.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}

    # Remove any extra spaces and thousands separators
    cleaned = value_str.strip().replace(',', '')
    if not cleaned:
        raise ValueError("empty counter text, cannot convert to int")

    multiplier = multipliers.get(cleaned[-1].lower())
    if multiplier is None:
        # If no suffix, simply convert the string to an integer
        return int(cleaned)

    # round() rather than int(): the float product can land just below the exact
    # value (e.g. 1.005 * 1000 -> 1004.9999999999999) and truncation would lose 1.
    return round(float(cleaned[:-1]) * multiplier)

def parseGithubSearch(driver: webdriver.Remote, url: str, timeout: float = 10) -> int :
    """Open a GitHub search URL and return the number of results it displays.

    Args:
        driver: Selenium driver used to load the page. In this notebook it is
            logged in to GitHub before this function is called.
        url: Full URL of the GitHub search page to open.
        timeout: Maximum number of seconds to wait, first for the document to
            finish loading and then for the result counter to appear.

    Returns:
        The result count shown on the page, converted to an ``int`` by
        ``convert_github_placeholder_to_int``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page or the counter
            is not available within ``timeout`` seconds.
        ValueError: If the counter text cannot be converted to a number.
    """
    driver.get(url)

    # Wait for the page to be fully loaded
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )

    # Locate the div with class "Box-sc-g0xbh4-0 cAMcRf" and find the span inside it.
    # NOTE(review): these class names look auto-generated; this selector presumably
    # needs updating whenever GitHub changes its front end - confirm when it times out.
    counter_xpath = "//div[contains(@class, 'Box-sc-g0xbh4-0') and contains(@class, 'cAMcRf')]//span[contains(@class, 'prc-Text-Text-0ima0')]"
    span_element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, counter_xpath))
    )

    # Extract the text from the span and turn it into a number
    return convert_github_placeholder_to_int(span_element.text)

def generateGithubParsingFromParams(driver, codeToSearch: str, extensions: list[str]) -> int :
    """Build the GitHub code-search URL for a snippet and return its result count.

    The query is the snippet wrapped in double quotes, optionally followed by a
    ``path:*.<ext>`` filter per extension (joined with ``OR`` inside parentheses
    when there are several).

    Args:
        driver: Selenium driver forwarded to ``parseGithubSearch``.
        codeToSearch: Code snippet to search for.
        extensions: File extensions (without the dot) to restrict the search to.
            ``None`` or an empty list means no extension filter.

    Returns:
        The number of results returned by ``parseGithubSearch`` for the built URL.
    """
    # `extensions or []` makes None behave like an empty list instead of crashing
    # on len(None).
    path_filters = [f"path:*.{ext}" for ext in (extensions or [])]

    if len(path_filters) > 1:
        # Several extensions: group them so OR only applies to the path filters
        extensionString = f'({" OR ".join(path_filters)})'
    else:
        # Zero or one extension: no 'OR' and no parentheses needed
        extensionString = "".join(path_filters)

    # Skip the filter part when empty so the query has no trailing space
    query_parts = [f'"{codeToSearch}"', extensionString]
    query : str = " ".join(part for part in query_parts if part)
    finalString = f'https://github.com/search?q={urllib.parse.quote(query)}&type=code'
    return parseGithubSearch(driver, finalString)

def process_config_and_fetch_numbers(driver, file_path):
    """Run one GitHub code search per configuration entry and record the counts.

    Args:
        driver: Selenium driver forwarded to ``generateGithubParsingFromParams``.
        file_path: Path of the JSON configuration file. It must contain a list of
            objects with a ``"query"`` key and an optional ``"extensions"`` list.

    Returns:
        The loaded configuration list, where every entry has an additional
        ``"number"`` key holding the result count of its search.

    Raises:
        KeyError: If an entry has no ``"query"`` key.
    """
    config = load_and_process_config(file_path)

    for item in config:
        codeToSearch = item['query']
        # A missing or null "extensions" entry means "no extension filter"
        extensions = item.get('extensions') or []
        item["number"] = generateGithubParsingFromParams(driver, codeToSearch, extensions)

    return config

Now that everything is set up, we can run the actual parsing.

In [70]:
# Step 1: Start the browser and go to the GitHub login page
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://github.com/login")

    # Step 2: Wait for the username and password fields to be visible and fill them
    username_field = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "login_field"))
    )
    password_field = driver.find_element(By.ID, "password")

    # Fill in the login credentials
    username_field.send_keys(githubUsername)
    password_field.send_keys(githubPassword)

    # Submit the form by clicking the "Sign in" button
    login_button = driver.find_element(By.NAME, "commit")
    login_button.click()

    # Step 3: Wait for the login to complete (you can adjust this based on what you want to check)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(text(),'Pull requests')]"))
    )

    # Step 4: Now that we're logged in, run every search listed in the config file
    results = process_config_and_fetch_numbers(driver, config)
finally:
    # Step 5: Close the browser, even when the login or a search failed, so that
    # no Chrome process is left running after an error
    driver.quit()

To be done: processing the results into beautiful diagrams.

In [72]:
# Show the configuration entries, each now carrying the "number" of GitHub results
print(results)

[{'tool': 'mlflow', 'query': 'import mlflow', 'extensions': ['py', 'ipynb'], 'number': 55800}, {'tool': 'test', 'query': 'test query', 'extensions': ['azerty'], 'number': 0}]
