In [274]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import subprocess
from tqdm import tqdm
from selenium import webdriver
import json
# !pip install selenium

In [261]:

# Scrape target. Hardcoded for now; the plan is to read it from user input.
USER = "Kaggle"
# True when USER is an organisation login rather than a personal account.
IS_ORG = True
# Root that every listing URL and repository URL is built from.
BASE_URL = "https://github.com"

In [262]:
def get_repo_names(page, user, is_org = False):
    """Scrape one page of a GitHub account's repository listing.

    Args:
        page: Page number placed in the listing URL's ``page`` query parameter.
        user: GitHub user or organisation login.
        is_org: When True, open ``/orgs/<user>/repositories`` in a Chrome
            session driven by Selenium and parse the rendered page source
            (presumably because that page is built client-side -- confirm).
            Otherwise fetch ``/<user>?tab=repositories`` with ``requests``.

    Returns:
        list: Stripped link text of every repository heading found, in page
        order. Empty when the page has no matching headings.
    """
    if is_org:
        url = f'{BASE_URL}/orgs/{user}/repositories?page={page}'
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            html = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed,
            # so a failed scrape does not leave a Chrome process behind.
            driver.quit()
        soup = BeautifulSoup(html, 'html.parser')
        # Match on a substring of the class attribute rather than the full
        # class name.
        repo_elements = soup.find_all('h4', class_=lambda c: c and 'Title-module__heading' in c)
    else:
        url = f'{BASE_URL}/{user}?tab=repositories&page={page}'
        # Without a timeout a stalled connection would block the notebook forever.
        res = requests.get(url, timeout=30)
        soup = BeautifulSoup(res.text, 'html.parser')
        repo_elements = soup.find_all('h3', {'class': 'wb-break-all'})

    repo_names = []
    for element in repo_elements:
        link = element.find('a')
        if link is None:
            # Heading without a link: nothing to extract, skip instead of crashing.
            continue
        name = link.text.strip()
        if name:
            repo_names.append(name)
    return repo_names

    

In [263]:

# Walk the paginated repository listing until a page contributes nothing new.
repo_urls = []
repo_names = []
seen_names = set()
page = 1
while True:
    names = get_repo_names(page, USER, IS_ORG)

    # Keep only names not collected yet. Besides the normal "empty page" stop
    # condition, this ends the loop if an out-of-range page repeats earlier
    # content, which would otherwise loop forever and duplicate entries.
    new_names = []
    for name in names:
        if name not in seen_names:
            seen_names.add(name)
            new_names.append(name)
    if not new_names:
        break

    repo_names.extend(new_names)
    repo_urls.extend(f'{BASE_URL}/{USER}/{name}' for name in new_names)
    page += 1
    


In [264]:
# Sanity check: both lists should have the same length; preview the first five.
print(f"Total repo names :  {len(repo_names)}")
print(f"Total repo links :  {len(repo_urls)}")
print(repo_names[:5], repo_urls[:5], sep="\n")

Total repo names :  12
Total repo links :  12
['kaggle-api', 'kaggle-environments', 'kagglehub', 'docker-python', 'docker-rstats']
['https://github.com/Kaggle/kaggle-api', 'https://github.com/Kaggle/kaggle-environments', 'https://github.com/Kaggle/kagglehub', 'https://github.com/Kaggle/docker-python', 'https://github.com/Kaggle/docker-rstats']


In [265]:
# Per-account folder that holds the CSV index and the cloned repositories.
dir_path = 'clones/{}'.format(USER)
os.makedirs(dir_path, exist_ok=True)  # safe to re-run: an existing folder is fine

In [266]:
# Persist the scraped name/URL pairs next to the clones, without the index column.
csv_path = f'{dir_path}/repo_urls.csv'
df = pd.DataFrame({'name': repo_names, 'url': repo_urls})
df.to_csv(csv_path, index=False)

In [267]:
# Clone every repository into dir_path. Repositories that already have a
# checkout are skipped so the cell can be re-run, and failed clones are
# collected and reported instead of being silently ignored.
failed_clones = []
for url in tqdm(repo_urls):
    # `git clone <url>` creates a folder named after the last URL segment.
    clone_dir = os.path.join(dir_path, url.rsplit('/', 1)[-1])
    if os.path.isdir(os.path.join(clone_dir, '.git')):
        continue

    completed = subprocess.run(['git', 'clone', url], cwd=dir_path)
    if completed.returncode != 0:
        failed_clones.append(url)

if failed_clones:
    print(f"git clone failed for {len(failed_clones)} repo(s): {failed_clones}")


  0%|          | 0/12 [00:00<?, ?it/s]Cloning into 'kaggle-api'...
  8%|▊         | 1/12 [01:02<11:25, 62.31s/it]Cloning into 'kaggle-environments'...
 17%|█▋        | 2/12 [02:02<10:13, 61.35s/it]Cloning into 'kagglehub'...
 25%|██▌       | 3/12 [02:06<05:15, 35.00s/it]Cloning into 'docker-python'...
 33%|███▎      | 4/12 [03:02<05:45, 43.18s/it]Cloning into 'docker-rstats'...
 42%|████▏     | 5/12 [03:03<03:16, 28.10s/it]Cloning into 'docker-rcran'...
 50%|█████     | 6/12 [03:05<01:54, 19.08s/it]Cloning into 'learntools'...
 58%|█████▊    | 7/12 [03:19<01:26, 17.40s/it]Cloning into 'kagglesdk'...
 67%|██████▋   | 8/12 [03:20<00:49, 12.26s/it]Cloning into '.allstar'...
 75%|███████▌  | 9/12 [03:21<00:26,  8.80s/it]Cloning into 'jupyterlab'...
 83%|████████▎ | 10/12 [03:23<00:13,  6.52s/it]Cloning into 'docker-julia'...
 92%|█████████▏| 11/12 [03:24<00:04,  4.89s/it]Cloning into 'pipelinehelpers'...
100%|██████████| 12/12 [03:25<00:00, 17.12s/it]


In [287]:
# Count lines of code per language for every cloned repository using `cloc`.
languages = ["Python", "JavaScript", "TypeScript", "Java"]

path = f'clones/{USER}'
total_repo = len(repo_names)

# lang_stats[lang][i] is the code-line count of `lang` in repo_names[i].
lang_stats = {lang: [] for lang in languages}

# One dict per repository: name, per-language counts and their sum.
repo_summaries = []

for repo in repo_names:
    repo_path = os.path.join(path, repo)

    result = subprocess.run(
        ["cloc", repo_path, f"--include-lang={','.join(languages)}", "--json"],
        capture_output=True,
        text=True
    )

    # Blank stdout would make json.loads raise an opaque JSONDecodeError;
    # treat it as "nothing counted" and surface whatever cloc wrote to stderr.
    cloc_output = result.stdout.strip()
    if cloc_output:
        data = json.loads(cloc_output)
    else:
        print(f"cloc produced no output for {repo_path}; counting 0 lines. {result.stderr.strip()}")
        data = {}

    repo_summary = {"repo_name": repo, "total": 0}

    for lang in languages:
        # A language missing from the report counts as zero lines.
        count = data.get(lang, {}).get("code", 0)
        repo_summary[lang] = count
        repo_summary["total"] += count
        lang_stats[lang].append(count)

    repo_summaries.append(repo_summary)

{'repo_name': 'kaggle-api', 'total': 20337, 'Python': 20337, 'JavaScript': 0, 'TypeScript': 0, 'Java': 0}


In [288]:
# Show the first repository's summary, then the per-language count lists.
print(repo_summaries[0], lang_stats, sep="\n")

{'repo_name': 'kaggle-api', 'total': 20337, 'Python': 20337, 'JavaScript': 0, 'TypeScript': 0, 'Java': 0}
{'Python': [20337, 16772, 7014, 3437, 0, 0, 12078, 10489, 0, 0, 0, 0], 'JavaScript': [0, 6979, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0], 'TypeScript': [0, 1849, 0, 0, 0, 0, 0, 0, 0, 1131, 0, 0], 'Java': [0, 1608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
