# Webscraping Project from GITHUB

In [1]:
# Objective is to get details of all top trending topics on Github
# We need to get the Topic Name, Description and Topic Link on first page
# We need to get details of top repositories of all top topics and create as csv file for each topic
# We need username, repo name, star rating and link of each repository of each topic. 

In [2]:
#Import all libraries needed and rest as you work on
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [3]:
#There are many functions in this code but the beggining is here even though this is not the first fucntion
# This function helps us get all the top trending topics on the page of github

def scrape_topics():
    topics_url = "https://github.com/topics"
    response = requests.get(topics_url)
        
    from bs4 import BeautifulSoup
    doc = BeautifulSoup(response.text, 'html.parser')
    
    topics_dict = {
    'Titles' : get_topic_titles(doc),
    'Descriptions' : get_topic_desc(doc),
    'Links' : get_topic_links(doc) }
  
    return pd.DataFrame(topics_dict)


#The below 3 functions are the helper functions to the above function to retrieve details of each topic

def get_topic_titles(doc):
    
    selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    topic_title_tags = doc.find_all('p',{'class': selection_class})
    topic_titles = []
    
    for tag in topic_title_tags:
        topic_titles.append(tag.text)
    return topic_titles


def get_topic_desc(doc):
    
    desc_selector_class = 'f5 color-fg-muted mb-0 mt-1'
    topic_desc_tags = doc.find_all('p',{'class': desc_selector_class})
    topic_descs = []
    
    for tag in topic_desc_tags:
        topic_descs.append(tag.text)
    return topic_descs

def get_topic_links(doc):
    
    link_selector = 'no-underline flex-grow-0'
    title_links = doc.find_all('a',{'class':link_selector})
    topic_links = []
    
    for tag in title_links:
        topic_links.append("https://github.com"+tag['href'])
    return topic_links


#After all the above functions we will have a dictionary / data frame which contains top 30 Titles, descriptions and links of topics
    

In [4]:
#This is the second phase of execution
#Now that we have topic names and URl's, we will scrape repo details of each topic through below function.

def scrape_topic(path,topic_url):
    import os
    if os.path.exists(path):
        print("The file {} already exists..so skipping ".format(path))
        return
        
    topic_df = get_topic_repos(get_topic_doc(topic_url))
    topic_df.to_csv(path,index = None)
    
    return topic_df

#The final CSV files are created after the above function

#The below helper function get us the data inside each link of topic and gives us all content.

def get_topic_doc(topic_url):
    
    #Download page from link
    response = requests.get(topic_url)
    #Check for response
    if response.status_code != 200:
        raise Exception("Failed to load page {}".format(topic_url))
        pass
    #Parse using beautifulsoup
    topic_doc = BeautifulSoup(response.text, 'html.parser')
    
    return topic_doc

#The below helper function get us the data from all the tags got from previous function

def parse_star_count(star_tags):
    if star_tags.text[-1] == 'k':
        return int(float(star_tags.text[:-1])*1000)
    return(int(star_tags.text))

def get_repo_info(h3_tag,star_tags):
    #returns all about repo
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = "https://github.com"+ a_tags[1]['href']
    stars = parse_star_count(star_tags)
    
    return username,repo_name,repo_url,stars


def get_topic_repos(topic_doc):

    #Get H3 tags which has username, repo name and repo url
    username_tags = topic_doc.find_all('h3',{'class':'f3 color-fg-muted text-normal lh-condensed'})
    
    #get span tag which has stars
    star_tags = topic_doc.find_all('span', {'class':'Counter js-social-count'})
    
    #Get repo infos
    
    topic_repos_dict = {
    'username': [],
    'repo_name':[],
    'stars': [],
    'repo_url':[]
                        }

    for i in range(len(username_tags)):
        repo_info = get_repo_info(username_tags[i],star_tags[i])
        topic_repos_dict["username"].append(repo_info[0])
        topic_repos_dict["repo_name"].append(repo_info[1])
        topic_repos_dict["stars"].append(repo_info[3])
        topic_repos_dict["repo_url"].append(repo_info[2])
    
    
    return(pd.DataFrame(topic_repos_dict))
    
    
    

In [5]:
#This is the parent function which triggers the execution of the whole code

def scrape_topics_repos():
    import os

    print("scraping list of topics")
    topics_df = scrape_topics()
    
    os.makedirs('data', exist_ok = True)
    
    for index,row in topics_df.iterrows():
        print("scraping top repositories for"+row["Titles"])
        scrape_topic('data/'+ row['Titles']+'.csv',row['Links'])
    
    
    

In [6]:
scrape_topics_repos()

scraping list of topics
scraping top repositories for3D
The file data/3D.csv already exists..so skipping 
scraping top repositories forAjax
The file data/Ajax.csv already exists..so skipping 
scraping top repositories forAlgorithm
The file data/Algorithm.csv already exists..so skipping 
scraping top repositories forAmp
The file data/Amp.csv already exists..so skipping 
scraping top repositories forAndroid
The file data/Android.csv already exists..so skipping 
scraping top repositories forAngular
The file data/Angular.csv already exists..so skipping 
scraping top repositories forAnsible
The file data/Ansible.csv already exists..so skipping 
scraping top repositories forAPI
The file data/API.csv already exists..so skipping 
scraping top repositories forArduino
The file data/Arduino.csv already exists..so skipping 
scraping top repositories forASP.NET
The file data/ASP.NET.csv already exists..so skipping 
scraping top repositories forAtom
The file data/Atom.csv already exists..so skipping

In [7]:
#If you want to get details all the pages of topics and all the repositories of each topic then...
# The easiest hack is to loop over the scrape_topics_repos() function but by passing a values of number of pages of topics and number of pages of repositories to the respective functions
# We need create a variable and add to the def scrape_topics(): topics_url = "https://github.com/topics?page=<Variable>
# We need to create a variable and add to the scrape_topics_repos() with row['Links']+'?page=<Variable>' and create loops
# This is a project to show the surface of capabilties of webscraping and much more can be accomplished with more time.
#I have done this project with the help of multiple tutorials and past experience in 6 hours of time.