In [29]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup
import os

def scrape_and_save_topic_info(main_url):
    page_doc = get_page(main_url)
    topics_data = parse_page(page_doc)    
    for i in range(0, len(topics_data['title'])):
        repo_df = get_repo_info(topics_data['url'][i])
        fname = topics_data['title'][i]
        if(os.path.exists(fname)):
            print(f"The file {fname} already exists. Skipping...")
            continue
        repo_df.to_csv(f"{fname}_repos.csv", index=None)
        print(f"Repository information for '{fname}' is saved as {fname}_repos.csv")

def get_page(url):
    # Download the page
    response = requests.get(url)
    #Check successful response 
    #parse using beautiful soup is response is successful 
    if 200 <= response.status_code <= 299:
        page_contents = response.text
        doc = BeautifulSoup(page_contents, 'html.parser')
        return doc
    else:
        raise Exception(f"Unable to fetch the given page {url}")

def parse_page(doc):
    topic_titles = []
    topic_descs = []
    topic_urls = []
    
    title_tags = doc.find_all('p', class_="f3 lh-condensed mb-0 mt-1 Link--primary")
    for tag in title_tags:
        topic_titles.append(tag.text)
        
    desc_tags = doc.find_all('p', class_="f5 color-fg-muted mb-0 mt-1")
    for tag in desc_tags:
        topic_descs.append(tag.text.strip())
        
    link_tags = doc.find_all('a', class_="no-underline flex-1 d-flex flex-column")
    base_url = "https://github.com/topic/"
    
    for title in topic_titles:
        topic_urls.append(base_url + title)

    topic_dict = {
        'title': topic_titles,
        'description': topic_descs,
        'url': topic_urls
    }
    
    topic_df = pd.DataFrame(topic_dict)
    topic_df.to_csv('topics.csv', index=None)
    
    return topic_dict

def get_repo_info(topic_url):
    soup = get_page(topic_url)
    h3_tags = soup.find_all('h3', class_="f3 color-fg-muted text-normal lh-condensed")
    
    repo_names = [tag.find_all('a')[1].text.strip() for tag in h3_tags[:10]]
    
    usernames = [tag.find_all('a')[0].text.strip() for tag in h3_tags[:10]]
    
    star_tags = soup.find_all('span', {'id': "repo-stars-counter-star"})
    stars_set = [int(float(tag.text[:-1]) * 1000) if tag.text[-1] == 'k' else int(tag.text) for tag in star_tags[:10]]     
    
    repo_urls = [f"https://github.com/topics{tag.find_all('a')[0]['href']}" for tag in h3_tags[:10]]

    repo_dict = {
        'repo name': repo_names,
        'user name': usernames,
        'stars count': stars_set,
        'repo url': repo_URLS
    }
    
    repo_df = pd.DataFrame(repo_dict)
    repo_df.to_csv('repo.csv', index=None)    
    return repo_df

In [30]:
scrape_and_save_topic_info("https://github.com/topics")

Repository information for '3D' is saved as 3D_repos.csv
Repository information for 'Ajax' is saved as Ajax_repos.csv
Repository information for 'Algorithm' is saved as Algorithm_repos.csv
Repository information for 'Amp' is saved as Amp_repos.csv
Repository information for 'Android' is saved as Android_repos.csv
Repository information for 'Angular' is saved as Angular_repos.csv
Repository information for 'Ansible' is saved as Ansible_repos.csv
Repository information for 'API' is saved as API_repos.csv
Repository information for 'Arduino' is saved as Arduino_repos.csv
Repository information for 'ASP.NET' is saved as ASP.NET_repos.csv
Repository information for 'Atom' is saved as Atom_repos.csv
Repository information for 'Awesome Lists' is saved as Awesome Lists_repos.csv
Repository information for 'Amazon Web Services' is saved as Amazon Web Services_repos.csv
Repository information for 'Azure' is saved as Azure_repos.csv
Repository information for 'Babel' is saved as Babel_repos.csv
R

Exception: Unable to fetch the given page https://github.com/topic/C++

In [15]:
import os