In [306]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import subprocess
from tqdm import tqdm
from selenium import webdriver
import json
import statistics
# !pip install selenium

In [307]:

# Root of the GitHub web UI; scraped listing URLs and clone URLs are built from it.
BASE_URL = 'https://github.com'
# Account whose repositories are analysed.
# TODO: take this as input from the user instead of hardcoding it.
USER = 'Cefalo'
# Passed to get_repo_names as is_org: selects the organization listing URL.
IS_ORG = True

In [308]:
def get_repo_names(page, user, is_org = False):
    """Return the repository names found on one page of a GitHub repository listing.

    Args:
        page: 1-based page number of the listing to fetch.
        user: GitHub user or organization name.
        is_org: When True, load ``/orgs/<user>/repositories`` in a Selenium-driven
            Chrome instance; otherwise fetch ``/<user>?tab=repositories`` with
            ``requests``.

    Returns:
        A list of repository names (stripped link text) in page order. The list
        is empty when the page contains no matching repository headings.

    Raises:
        requests.HTTPError: If the user listing responds with an error status.
        requests.Timeout: If the user listing does not answer within 30 seconds.
    """
    if is_org:
        url = f'{BASE_URL}/orgs/{user}/repositories?page={page}'
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # NOTE(review): page_source is read right after navigation, with no
            # explicit wait; confirm the listing is fully rendered at this point.
            html = driver.page_source
        finally:
            # Always close the browser, even if navigation fails, so Chrome
            # processes do not pile up across pages.
            driver.quit()
        soup = BeautifulSoup(html, 'html.parser')
        # Match on a substring of the class attribute rather than the exact value.
        repo_elements = soup.find_all('h4', class_=lambda c: c and 'Title-module__heading' in c)
    else:
        url = f'{BASE_URL}/{user}?tab=repositories&page={page}'
        res = requests.get(url, timeout=30)
        # An error page would otherwise parse to zero repositories and be
        # indistinguishable from "no more pages".
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')
        repo_elements = soup.find_all('h3', {'class': 'wb-break-all'})

    repo_names = []
    for element in repo_elements:
        link = element.find('a')
        if link is None:
            # Heading without a link: nothing to extract, skip it.
            continue
        repo_names.append(link.text.strip())
    return repo_names

    

In [309]:

# Walk the paginated listing until a page comes back empty, collecting every
# repository name together with its repository URL.
repo_names = []
repo_urls = []
page = 1
names = get_repo_names(page, USER, IS_ORG)
while names:
    repo_names.extend(names)
    repo_urls.extend(f'{BASE_URL}/{USER}/{name}' for name in names)
    page += 1
    names = get_repo_names(page, USER, IS_ORG)
    


In [310]:
# Sanity check: one URL is appended per name, so both totals should match.
print(f"Total repo names :  {len(repo_names)}")
print(f"Total repo links :  {len(repo_urls)}")
# Preview the first five entries of each list.
for preview in (repo_names, repo_urls):
    print(preview[:5])

Total repo names :  10
Total repo links :  10
['quick-meet', 'cms-backend-api', 'LetsLearnReact', 'lets-learn-refactoring', 'gotraining']
['https://github.com/Cefalo/quick-meet', 'https://github.com/Cefalo/cms-backend-api', 'https://github.com/Cefalo/LetsLearnReact', 'https://github.com/Cefalo/lets-learn-refactoring', 'https://github.com/Cefalo/gotraining']


In [311]:
# Local checkout root: one folder per account under clones/.
dir_path = f'clones/{USER}'
# exist_ok=True makes this a no-op when the folder already exists, so the cell can be re-run.
os.makedirs(dir_path, exist_ok=True)

In [312]:
# Save the name/URL listing as a CSV inside the checkout folder.
repo_table = {'name': repo_names, 'url': repo_urls}
df = pd.DataFrame(repo_table)
csv_path = f'{dir_path}/repo_urls.csv'
df.to_csv(csv_path, index=False)

In [313]:
# Clone every repository into dir_path. Repositories that already have a
# checkout are skipped so the cell can be re-run, and clones that fail are
# reported at the end instead of being silently ignored.
failed_clones = []
for url in tqdm(repo_urls):
    # `git clone <url>` names the checkout after the last URL segment.
    target_dir = os.path.join(dir_path, url.rstrip('/').rsplit('/', 1)[-1])
    if os.path.isdir(os.path.join(target_dir, '.git')):
        continue
    completed = subprocess.run(['git', 'clone', url], cwd=dir_path)
    if completed.returncode != 0:
        failed_clones.append(url)

if failed_clones:
    print(f'Failed to clone {len(failed_clones)} repositories: {failed_clones}')


  0%|          | 0/10 [00:00<?, ?it/s]Cloning into 'quick-meet'...
 10%|█         | 1/10 [00:19<02:52, 19.13s/it]Cloning into 'cms-backend-api'...
 20%|██        | 2/10 [00:20<01:10,  8.82s/it]Cloning into 'LetsLearnReact'...
 30%|███       | 3/10 [00:43<01:46, 15.16s/it]Cloning into 'lets-learn-refactoring'...
 40%|████      | 4/10 [00:44<00:57,  9.66s/it]Cloning into 'gotraining'...
 50%|█████     | 5/10 [00:47<00:36,  7.35s/it]Cloning into 'resume-parser'...
 60%|██████    | 6/10 [00:49<00:21,  5.39s/it]Cloning into 'cms-web-frontend'...
 70%|███████   | 7/10 [00:51<00:12,  4.24s/it]Cloning into 'ResumeParser'...
 80%|████████  | 8/10 [00:54<00:07,  3.86s/it]Cloning into 'lets-learn-tdd'...
 90%|█████████ | 9/10 [01:30<00:13, 13.97s/it]Cloning into 'refactoring-code-smells'...
100%|██████████| 10/10 [01:31<00:00,  9.18s/it]


In [314]:
# Count lines of code per language in every cloned repository with the `cloc` CLI.
languages = ["Python", "JavaScript", "TypeScript", "Java"]
path = f'clones/{USER}'
# lang_stats[lang][i] holds the code-line count of `lang` for repo_names[i].
lang_stats = {lang: [] for lang in languages}
# The language filter is identical for every repository, so build it once.
include_langs = ','.join(languages)

for repo in repo_names:
    repo_path = os.path.join(path, repo)

    result = subprocess.run(
        ["cloc", repo_path, f"--include-lang={include_langs}", "--json"],
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        # Surface cloc failures instead of dropping its stderr on the floor.
        print(f"cloc exited with status {result.returncode} for {repo}: {result.stderr.strip()}")

    # json.loads('') raises JSONDecodeError, so an empty report is treated as
    # "no counted lines" and every language gets 0 for this repository.
    report = result.stdout.strip()
    data = json.loads(report) if report else {}

    for lang in languages:
        # A language missing from the report contributes 0 lines.
        count = data.get(lang, {}).get("code", 0)
        lang_stats[lang].append(count)

In [315]:
# Raw per-language counts; position i in each list corresponds to repo_names[i].
print(lang_stats)

{'Python': [0, 0, 0, 0, 0, 0, 0, 1082, 0, 0], 'JavaScript': [82, 9, 918, 0, 0, 0, 143, 1, 0, 0], 'TypeScript': [6401, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Java': [0, 0, 0, 178, 0, 492, 0, 0, 566, 178]}


In [316]:
# Collapse the per-repository counts into one total and one median per language.
total_summary = {}
for lang in languages:
    lang_counts = lang_stats[lang]
    print(lang, lang_counts)
    total_summary[lang] = {
        "total" : sum(lang_counts),
        # statistics.median raises StatisticsError on an empty sequence (no
        # repositories found); record None, which json.dump writes as null.
        "median" : statistics.median(lang_counts) if lang_counts else None
    }

Python [0, 0, 0, 0, 0, 0, 0, 1082, 0, 0]
JavaScript [82, 9, 918, 0, 0, 0, 143, 1, 0, 0]
TypeScript [6401, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Java [0, 0, 0, 178, 0, 492, 0, 0, 566, 178]


In [317]:
# Per-language totals and medians, exactly as they are written to the JSON summary file.
print(total_summary)

{'Python': {'total': 1082, 'median': 0.0}, 'JavaScript': {'total': 1153, 'median': 0.5}, 'TypeScript': {'total': 6401, 'median': 0.0}, 'Java': {'total': 1414, 'median': 0.0}}


In [320]:
# Write the per-language summary to a JSON file in the current working directory.
summary_path = f'{USER}_language_summary.json'
with open(summary_path, 'w') as summary_file:
    json.dump(total_summary, summary_file)