In [47]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import subprocess
from tqdm import tqdm
from selenium import webdriver
import json
import statistics
import time

In [10]:

# GitHub account whose repositories are scraped, cloned and measured.
# TODO: take this from user input instead of hard-coding it.
USER = 'Kaggle'

# Passed to get_repo_names() as is_org; selects the organisation listing URL.
IS_ORG = True

# Prefix used to build every listing and repository URL.
BASE_URL = 'https://github.com'

In [65]:
def get_repo_names(page, user, is_org = False):
    """Return the repository names found on one page of a GitHub repo listing.

    Args:
        page: Page number placed in the listing URL's ``page`` query parameter.
        user: GitHub user or organisation login.
        is_org: When True, open ``/orgs/<user>/repositories`` in a
            Selenium-driven Chrome and parse the rendered page source.
            Otherwise fetch ``/<user>?tab=repositories`` with ``requests``.

    Returns:
        list[str]: Link texts of the repository headings on the page, in
        document order. Empty when no matching heading is found, which the
        caller can use as the "no more pages" signal.

    Raises:
        requests.exceptions.Timeout: in the non-org branch, if GitHub does not
            answer within the request timeout.
    """
    if is_org:
        url = f'{BASE_URL}/orgs/{user}/repositories?page={page}'
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Brief pause before reading the DOM -- presumably to let the
            # client-side rendering of the listing finish.
            time.sleep(0.5)
            html = driver.page_source
        finally:
            # Always close the browser, even if loading the page failed.
            driver.quit()
        soup = BeautifulSoup(html, 'html.parser')
        # Match any <h4> whose class attribute contains the heading marker.
        repo_elements = soup.find_all('h4', class_=lambda c: c and 'Title-module__heading' in c)
    else:
        url = f'{BASE_URL}/{user}?tab=repositories&page={page}'
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, timeout=30)
        soup = BeautifulSoup(res.text, 'html.parser')
        repo_elements = soup.find_all('h3', {'class': 'wb-break-all'})

    # Shared extraction: the repo name is the text of the first link inside
    # each heading. (The non-org branch previously read `name_tag` without
    # ever assigning it.)
    repo_names = []
    for element in repo_elements:
        name_tag = element.find('a')
        name = name_tag.text.strip() if name_tag else None
        if name:
            repo_names.append(name)
    return repo_names

    

In [66]:

# Walk the listing page by page until a page comes back empty, collecting
# each repository name together with its full GitHub URL.
repo_names, repo_urls = [], []
page = 1
names = get_repo_names(page, USER, IS_ORG)
while names:
    for name in names:
        repo_names.append(name)
        repo_urls.append(f'{BASE_URL}/{USER}/{name}')
    page += 1
    names = get_repo_names(page, USER, IS_ORG)
    


In [64]:
# Sanity check: both lists should be the same length; preview the first five of each.
print(f"Total repo names :  {len(repo_names)}")
print(f"Total repo links :  {len(repo_urls)}")
for preview in (repo_names, repo_urls):
    print(preview[:5])

Total repo names :  12
Total repo links :  12
['kaggle-api', 'kaggle-environments', 'kagglehub', 'docker-python', 'docker-rstats']
['https://github.com/Kaggle/kaggle-api', 'https://github.com/Kaggle/kaggle-environments', 'https://github.com/Kaggle/kagglehub', 'https://github.com/Kaggle/docker-python', 'https://github.com/Kaggle/docker-rstats']


In [14]:
# Per-account directory that receives the clones and the URL table.
dir_path = f'clones/{USER}'
# exist_ok makes re-running this cell harmless.
os.makedirs(name=dir_path, exist_ok=True)

In [15]:
# Persist the name -> URL table next to the clones.
url_columns = {'name': repo_names, 'url': repo_urls}
df = pd.DataFrame(url_columns)
urls_csv_path = f'{dir_path}/{USER}_urls.csv'
df.to_csv(urls_csv_path, index=False)

In [16]:
# Clone every discovered repository into dir_path.
# Repositories that already have a checkout are skipped so the cell can be
# re-run without git failing on "destination path already exists", and
# failed clones are reported instead of being silently ignored.
failed_clones = []
for url in tqdm(repo_urls):
    # `git clone <url>` names the checkout after the last URL segment.
    clone_dir = os.path.join(dir_path, url.rstrip('/').rsplit('/', 1)[-1])
    if os.path.isdir(os.path.join(clone_dir, '.git')):
        continue
    completed = subprocess.run(['git', 'clone', url], cwd=dir_path)
    if completed.returncode != 0:
        failed_clones.append(url)

if failed_clones:
    print(f'{len(failed_clones)} clone(s) failed: {failed_clones}')


  0%|          | 0/12 [00:00<?, ?it/s]Cloning into 'kaggle-api'...
  8%|▊         | 1/12 [00:03<00:40,  3.66s/it]Cloning into 'kaggle-environments'...
 17%|█▋        | 2/12 [00:06<00:33,  3.38s/it]Cloning into 'kagglehub'...
 25%|██▌       | 3/12 [00:10<00:33,  3.69s/it]Cloning into 'docker-python'...
 33%|███▎      | 4/12 [01:04<03:07, 23.50s/it]Cloning into 'docker-rstats'...
 42%|████▏     | 5/12 [01:06<01:48, 15.52s/it]Cloning into 'docker-rcran'...
 50%|█████     | 6/12 [01:07<01:04, 10.80s/it]Cloning into 'learntools'...
 58%|█████▊    | 7/12 [01:20<00:57, 11.48s/it]Cloning into 'kagglesdk'...
 67%|██████▋   | 8/12 [01:21<00:32,  8.20s/it]Cloning into '.allstar'...
 75%|███████▌  | 9/12 [01:23<00:18,  6.05s/it]Cloning into 'jupyterlab'...
 83%|████████▎ | 10/12 [01:24<00:09,  4.66s/it]Cloning into 'docker-julia'...
 92%|█████████▏| 11/12 [01:25<00:03,  3.60s/it]Cloning into 'pipelinehelpers'...
100%|██████████| 12/12 [01:27<00:00,  7.27s/it]


In [17]:
# Languages to measure, spelled the way they are passed to cloc's
# --include-lang filter and looked up in its JSON report.
languages = ["Python","JavaScript","Java","TypeScript","C#","C++","Go","PHP","Ruby","Bourne Shell"]

path = f'clones/{USER}'

# language -> list of code-line counts, one entry per repo in repo_names order.
lang_stats = {lang: [] for lang in languages}

for repo in repo_names:
    repo_path = os.path.join(path, repo)

    result = subprocess.run(
        ["cloc", repo_path, f"--include-lang={','.join(languages)}", "--json"],
        capture_output=True,
        text=True
    )

    # cloc may write nothing, or something that is not JSON (for example when
    # the path is missing). Treat that as "no counted code" so every list
    # still gets exactly one entry per repo instead of aborting the loop.
    try:
        data = json.loads(result.stdout) if result.stdout.strip() else {}
    except json.JSONDecodeError:
        data = {}
    if not data:
        print(f'cloc produced no usable JSON for {repo_path}; recording zeros')

    for lang in languages:
        # Languages missing from the report contribute 0 lines.
        count = data.get(lang, {}).get("code", 0)
        lang_stats[lang].append(count)

In [18]:
# Dump the raw per-language code-line counts (one list entry per repo).
print(lang_stats)

{'Python': [20337, 16772, 7014, 3437, 0, 0, 12078, 10489, 0, 0, 0, 0], 'JavaScript': [0, 6979, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0], 'Java': [0, 1608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'TypeScript': [0, 1849, 0, 0, 0, 0, 0, 0, 0, 1131, 0, 0], 'C#': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'C++': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Go': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'PHP': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Ruby': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Bourne Shell': [324, 300, 0, 8, 7, 502, 611, 0, 0, 3, 0, 0]}


In [19]:
# Collapse each language's per-repo counts into a total and a median.
total_summary = {}
for lang in languages:
    counts = lang_stats[lang]
    print(lang, counts)
    if counts:
        middle = statistics.median(counts)
    else:
        # No repos measured: report 0 rather than letting median() raise.
        middle = 0
    total_summary[lang] = {"total": sum(counts), "median": middle}

Python [20337, 16772, 7014, 3437, 0, 0, 12078, 10489, 0, 0, 0, 0]
JavaScript [0, 6979, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0]
Java [0, 1608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
TypeScript [0, 1849, 0, 0, 0, 0, 0, 0, 0, 1131, 0, 0]
C# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
C++ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Go [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
PHP [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Ruby [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Bourne Shell [324, 300, 0, 8, 7, 502, 611, 0, 0, 3, 0, 0]


In [20]:
# Show the per-language totals and medians.
print(total_summary)

{'Python': {'total': 70127, 'median': 1718.5}, 'JavaScript': {'total': 6985, 'median': 0.0}, 'Java': {'total': 1608, 'median': 0.0}, 'TypeScript': {'total': 2980, 'median': 0.0}, 'C#': {'total': 0, 'median': 0.0}, 'C++': {'total': 0, 'median': 0.0}, 'Go': {'total': 0, 'median': 0.0}, 'PHP': {'total': 0, 'median': 0.0}, 'Ruby': {'total': 0, 'median': 0.0}, 'Bourne Shell': {'total': 1755, 'median': 5.0}}


In [21]:
# Write the summary as JSON into the current working directory.
summary_path = f'{USER}_language_summary.json'
with open(summary_path, 'w') as summary_file:
    summary_file.write(json.dumps(total_summary))

In [22]:
# Confirmation message naming the summary file.
summary_filename = f'{USER}_language_summary.json'
print(f'{summary_filename} created successfully')

Kaggle_language_summary.json created successfully
