In [22]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd
import os

base_url ="https://github.com"

In [23]:

def get_titles(doc):
    """Collect the topic titles from a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)`` (a BeautifulSoup
            document in this notebook).

    Returns:
        list[str]: Text of every ``<p>`` whose class attribute is
        ``'f3 lh-condensed mb-0 mt-1 Link--primary'``, with surrounding
        whitespace removed.
    """
    select_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    title_tags = doc.find_all('p', {"class": select_class})

    # Strip like get_desc() does: the titles are reused for CSV file names and
    # log lines, so padding or newlines from the markup must not leak through.
    return [tag.text.strip() for tag in title_tags]

# ===============================================================================================
def get_desc(doc):
    """Collect the topic descriptions from a parsed topics listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)``.

    Returns:
        list[str]: Whitespace-stripped text of every ``<p>`` whose class
        attribute is ``'f5 color-text-secondary mb-0 mt-1'``.
    """
    description_tags = doc.find_all(
        'p',
        {"class": 'f5 color-text-secondary mb-0 mt-1'},
    )
    return [tag.text.strip() for tag in description_tags]

# ===============================================================================================

def get_url(doc):
    """Build the absolute URL of every topic linked from the listing page.

    Args:
        doc: Parsed page exposing ``find_all(name, attrs)``.

    Returns:
        list[str]: ``"https://github.com"`` prepended to the ``href`` of every
        ``<a>`` whose class attribute is ``'d-flex no-underline'``.
    """
    # Local prefix on purpose: this function does not read the module-level
    # base_url.
    site_root = "https://github.com"
    link_tags = doc.find_all('a', {"class": 'd-flex no-underline'})
    return [site_root + link['href'] for link in link_tags]

# ===============================================================================================


def get_doc(topics_url, timeout=30):
    """Download a page and parse it into a BeautifulSoup document.

    Args:
        topics_url: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one the request can block indefinitely on an unresponsive server.

    Returns:
        BeautifulSoup: The response body parsed with ``'html.parser'``.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (for example on timeout or connection failure).
    """
    response = requests.get(topics_url, timeout=timeout)
    if response.status_code != 200:
        # Include the URL and status so a failure in a long scrape is traceable.
        raise Exception(
            "Failed to load page {} (status code {})".format(
                topics_url, response.status_code
            )
        )

    return BeautifulSoup(response.text, 'html.parser')

# ===============================================================================================


def scrape_topics(topics_url='https://github.com/topics'):
    """Scrape a GitHub topics listing page into a DataFrame.

    Args:
        topics_url: Listing page to scrape. Defaults to the main topics page,
            so existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame: Columns ``titles``, ``description`` and ``url``,
        filled from get_titles(), get_desc() and get_url() respectively.

    Raises:
        Exception: Propagated from get_doc() when the page cannot be loaded.
    """
    doc = get_doc(topics_url)

    return pd.DataFrame({
        "titles": get_titles(doc),
        "description": get_desc(doc),
        "url": get_url(doc),
    })

# ===============================================================================================


In [24]:
# Scrape the topics listing and check which columns the resulting frame has.
topic_df= scrape_topics()
topic_df.columns

Index(['titles', 'description', 'url'], dtype='object')

In [25]:
# `topics_url` only exists as a local inside scrape_topics(), so define it here;
# otherwise this cell raises NameError after Restart Kernel -> Run All.
topics_url = 'https://github.com/topics'

# Run each extractor on the same parsed page and compare the list lengths:
# scrape_topics() builds its DataFrame from these three lists.
doc = get_doc(topics_url)
titles = get_titles(doc)
description = get_desc(doc)
url = get_url(doc)
len(titles), len(description), len(url)


(30, 30, 30)

In [26]:
# Show the first five entries of each list extracted in the previous cell.
titles[:5] , description[:5] , url[:5]

(['3D', 'Ajax', 'Algorithm', 'Amp', 'Android'],
 ['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
  'Ajax is a technique for creating interactive web applications.',
  'Algorithms are self-contained sequences that carry out a variety of tasks.',
  'Amp is a non-blocking concurrency framework for PHP.',
  'Android is an operating system built by Google designed for mobile devices.'],
 ['https://github.com/topics/3d',
  'https://github.com/topics/ajax',
  'https://github.com/topics/algorithm',
  'https://github.com/topics/amphp',
  'https://github.com/topics/android'])

In [27]:
def parse_star(stars_str):
    """Convert a displayed star count such as ``"81.2k"`` into an integer.

    Accepts plain integers (``"950"``), thousands separators (``"1,234"``),
    surrounding whitespace, and a trailing ``k`` (thousands) or ``m``
    (millions) suffix in either case.

    Args:
        stars_str: Star count text.

    Returns:
        int: The star count.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    multipliers = {'k': 1000, 'm': 1000000}

    cleaned = stars_str.strip().replace(',', '').lower()
    if not cleaned:
        raise ValueError("Empty star count: {!r}".format(stars_str))

    suffix = cleaned[-1]
    if suffix in multipliers:
        # round() before int(): a float product can land just below the
        # intended whole number, and int() alone would truncate it.
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)


# ===============================================================================================


def get_repo_doc(topics_url):
    """Download a single topic page and parse it into a BeautifulSoup document.

    The fetch-and-parse steps are the same as get_doc(), so this delegates to
    it instead of keeping a second copy of the request and error handling.

    Args:
        topics_url: URL of the topic page to download.

    Returns:
        BeautifulSoup: The page parsed with ``'html.parser'``.

    Raises:
        Exception: Propagated from get_doc() when the status code is not 200.
    """
    return get_doc(topics_url)

# ===============================================================================================

def get_repo_parent(topic_doc):
    """Find the heading element of every repository listed on a topic page.

    Args:
        topic_doc: Parsed topic page exposing ``find_all(name, attrs)``.

    Returns:
        Whatever ``find_all`` yields for ``<h3>`` elements whose class
        attribute is ``'f3 color-text-secondary text-normal lh-condensed'``.
    """
    return topic_doc.find_all(
        'h3',
        {'class': 'f3 color-text-secondary text-normal lh-condensed'},
    )

# ===============================================================================================
# ===============================================================================================

def get_repo_star(topic_doc):
    """Find the star-count link of every repository listed on a topic page.

    Args:
        topic_doc: Parsed topic page exposing ``find_all(name, attrs)``.

    Returns:
        Whatever ``find_all`` yields for ``<a>`` elements whose class
        attribute is ``'social-count float-none'``.
    """
    return topic_doc.find_all('a', {'class': 'social-count float-none'})

# ===============================================================================================
 
    
def get_repo_info(parent , star_tag):
    """Extract owner, name, URL and star count for one repository.

    Args:
        parent: Heading element of the repository; its first ``<a>`` supplies
            the username, its second the repository name and ``href``.
        star_tag: Element whose text is the displayed star count.

    Returns:
        tuple: ``(username, repo_name, repo_url, stars)`` where ``repo_url`` is
        the module-level ``base_url`` joined with the repository ``href`` and
        ``stars`` is the result of parse_star().
    """
    links = parent.find_all('a')
    owner_link = links[0]
    username = owner_link.text.strip()

    repo_link = links[1]
    repo_name = repo_link.text.strip()
    repo_url = base_url + repo_link['href']

    star_text = star_tag.text.strip()
    return username, repo_name, repo_url, parse_star(star_text)


# ===============================================================================================

def get_repo_all(topics_url):
    """Scrape the repositories listed on one topic page into a DataFrame.

    Args:
        topics_url: URL of the topic page.

    Returns:
        pandas.DataFrame: One row per repository heading found, with columns
        ``username``, ``repo_name``, ``stars`` and ``repo_url``.
    """
    page = get_repo_doc(topics_url)
    repo_headings = get_repo_parent(page)
    star_tags = get_repo_star(page)

    columns = {'username': [], 'repo_name': [], 'stars': [], 'repo_url': []}

    # Headings and star links are paired purely by position on the page.
    for position, heading in enumerate(repo_headings):
        username, repo_name, repo_url, stars = get_repo_info(
            heading, star_tags[position]
        )
        columns['username'].append(username)
        columns['repo_name'].append(repo_name)
        columns['repo_url'].append(repo_url)
        columns['stars'].append(stars)

    return pd.DataFrame(columns)

In [28]:
# Try the repository scraper on a single topic page before running it for every topic.
get_repo_all("https://github.com/topics/angular")


Unnamed: 0,username,repo_name,stars,repo_url
0,justjavac,free-programming-books-zh_CN,81200,https://github.com/justjavac/free-programming-...
1,angular,angular,74600,https://github.com/angular/angular
2,storybookjs,storybook,63300,https://github.com/storybookjs/storybook
3,ionic-team,ionic-framework,44900,https://github.com/ionic-team/ionic-framework
4,leonardomso,33-js-concepts,41500,https://github.com/leonardomso/33-js-concepts
5,prettier,prettier,40100,https://github.com/prettier/prettier
6,SheetJS,sheetjs,26100,https://github.com/SheetJS/sheetjs
7,angular,angular-cli,24700,https://github.com/angular/angular-cli
8,angular,components,21600,https://github.com/angular/components
9,NativeScript,NativeScript,20300,https://github.com/NativeScript/NativeScript


In [29]:

def scrape_topic(topic_url , path):
    """Scrape one topic's repositories to a CSV file unless the file exists.

    Args:
        topic_url: URL of the topic page to scrape.
        path: Destination CSV path. If something already exists there, a
            message is printed and nothing is downloaded or written.

    Returns:
        None.
    """
    if not os.path.exists(path):
        repos_df = get_repo_all(topic_url)
        repos_df.to_csv(path, index= None)
        return

    print("skippping a file here ---------")
# ===============================================================================================


def scrape_topics_repos():
    """Scrape every listed topic and save its repositories under ``data/``.

    Each topic is written to ``data/<title>.csv`` through scrape_topic(), which
    skips paths that already exist.

    Returns:
        None.
    """
    print("List of top topics from Github ")
    listing = scrape_topics()

    os.makedirs('data', exist_ok = True)

    # Walk the two needed columns in lockstep instead of building row objects.
    for title, topic_url in zip(listing['titles'], listing['url']):
        print('Scraping top repositories for "{}"'.format(title))
        csv_path = "data/" + title + ".csv"
        scrape_topic(topic_url, csv_path)



In [30]:
# Run the full scrape; topics whose CSV already exists are skipped by scrape_topic().
scrape_topics_repos()

List of top topics from Github 
Scraping top repositories for "3D"
skippping a file here ---------
Scraping top repositories for "Ajax"
skippping a file here ---------
Scraping top repositories for "Algorithm"
skippping a file here ---------
Scraping top repositories for "Amp"
skippping a file here ---------
Scraping top repositories for "Android"
skippping a file here ---------
Scraping top repositories for "Angular"
skippping a file here ---------
Scraping top repositories for "Ansible"
skippping a file here ---------
Scraping top repositories for "API"
skippping a file here ---------
Scraping top repositories for "Arduino"
skippping a file here ---------
Scraping top repositories for "ASP.NET"
skippping a file here ---------
Scraping top repositories for "Atom"
skippping a file here ---------
Scraping top repositories for "Awesome Lists"
skippping a file here ---------
Scraping top repositories for "Amazon Web Services"
skippping a file here ---------
Scraping top repositories for "