## SCRAPING_GITHUB_TOP_REPOSITORIES

### Project Outline

- Scraping data from https://github.com/topics
- Getting a list of topics and, for each topic, its title, page URL and description
- For each topic, getting the top 25 repositories from the topic page
- For each repository, grabbing the repo name, username, stars and repo URL
- For each topic, creating a CSV file


## Scrape list of topics

- Using requests to download the page
- Use of BS4 to parse and extract information
- Converting to a Pandas DataFrame

#### Downloading the page

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_topics_page():
    """Download and parse the GitHub topics listing page.

    Returns:
        A BeautifulSoup document built from the HTML of
        https://github.com/topics.

    Raises:
        Exception: If the response status code is not 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was actually requested.
        raise Exception('Failed to load page{}'.format(topics_url))
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# Download and parse the topics listing page once; the cells below reuse `doc`.
doc = get_topics_page()

In [None]:
# Show the first <a> tag of the parsed page.
doc.find('a')

#### Helper functions to parse information from the page

In [None]:
def get_topic_titles(doc):
    """Collect the topic titles from a parsed topics page.

    Args:
        doc: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A list holding the ``text`` of every matching ``<p>`` tag, in the
        order ``find_all`` returns them. The text is not stripped.
    """
    title_css = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [p_tag.text for p_tag in doc.find_all('p', {'class': title_css})]

`get_topic_titles` is used to get the list of topic titles.

In [None]:
# Extract the topic titles from the page parsed above.
titles = get_topic_titles(doc)

In [None]:
# Preview the first five titles.
titles[:5]

##### Similarly defined functions for descriptions and URLs

In [None]:
def get_topic_descriptions(doc):
    """Collect the topic descriptions from a parsed topics page.

    Args:
        doc: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A list holding the whitespace-stripped ``text`` of every matching
        ``<p>`` tag, in the order ``find_all`` returns them.
    """
    description_css = 'f5 color-fg-muted mb-0 mt-1'
    matching_tags = doc.find_all('p', {'class': description_css})
    return [p_tag.text.strip() for p_tag in matching_tags]


In [None]:
def get_topic_urls(doc):
    """Build the absolute URL of every topic linked from a topics page.

    Args:
        doc: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        A list of strings, each the GitHub base URL followed by the
        ``href`` of a matching ``<a>`` tag.
    """
    github_root = 'https://github.com'
    anchors = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
    return [github_root + anchor['href'] for anchor in anchors]

#### Putting all together in a single function

In [None]:
def scrape_topics():
    """Scrape the GitHub topics page into a DataFrame.

    Returns:
        A pandas DataFrame with the columns ``Title``, ``Description`` and
        ``URL``, filled from the three ``get_topic_*`` helpers.

    Raises:
        Exception: If the topics page does not respond with status 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was actually requested.
        raise Exception('Failed to load page{}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'Title': get_topic_titles(doc),
        'Description': get_topic_descriptions(doc),
        'URL': get_topic_urls(doc),
    }
    return pd.DataFrame(topics_dict)

## Getting the top 25 repositories from topic page

In [None]:
def get_topic_page(topic_url):
    """Fetch a single topic page and return it parsed.

    Args:
        topic_url: URL of the topic page to download.

    Returns:
        A BeautifulSoup document built from the response HTML.

    Raises:
        Exception: If the response status code is not 200.
    """
    page = requests.get(topic_url)

    # Anything other than a 200 response is treated as a failed download.
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception('Failed to load page{}'.format(topic_url))

In [None]:
# Fetch the '3d' topic page as a sample; this rebinds `doc`.
doc = get_topic_page('https://github.com/topics/3d')

In [None]:
def parse_star_count(stars_str):
    """Convert a displayed star count such as ``'93.5k'`` into an int.

    Args:
        stars_str: Counter text; may carry surrounding whitespace,
            thousands separators, or a ``k``/``m`` magnitude suffix.

    Returns:
        The star count as an integer.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '').lower()
    multipliers = {'k': 1000, 'm': 1000000}
    if cleaned and cleaned[-1] in multipliers:
        # round() guards against float artefacts such as 1100.0000000000002.
        return int(round(float(cleaned[:-1]) * multipliers[cleaned[-1]]))
    return int(cleaned)


def get_repo_info(h3_tag, star_tag):
    """Extract the details of one repository from its page tags.

    Args:
        h3_tag: Heading tag whose first ``<a>`` holds the username and
            whose second ``<a>`` holds the repository name and link.
        star_tag: Tag whose text is the displayed star count.

    Returns:
        A tuple ``(username, repo_name, stars, repo_url)`` where ``stars``
        is an int and ``repo_url`` is an absolute GitHub URL.

    Raises:
        IndexError: If ``h3_tag`` contains fewer than two ``<a>`` tags.
        ValueError: If the star count text cannot be parsed.
    """
    # Defined locally: no module-level `base_url` exists for this function.
    base_url = 'https://github.com'
    a_tags = h3_tag.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tag.text)
    return username, repo_name, stars, repo_url

In [None]:
def get_topic_repos(topic_doc):
    """Tabulate the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing a BeautifulSoup-style
            ``find_all``.

    Returns:
        A pandas DataFrame with the columns ``UserName``, ``Repo_Name``,
        ``Stars`` and ``Repo_URL``, one row per repository heading found.

    Raises:
        IndexError: If fewer star counters than repository headings are
            found on the page.
    """
    # Headings carrying the username, the repo title and the repo link.
    heading_tags = topic_doc.find_all(
        'h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})

    # Star counters, paired with the headings by position.
    star_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    columns = ('UserName', 'Repo_Name', 'Stars', 'Repo_URL')
    collected = {column: [] for column in columns}

    for position, heading in enumerate(heading_tags):
        details = get_repo_info(heading, star_tags[position])
        for column, value in zip(columns, details):
            collected[column].append(value)

    return pd.DataFrame(collected)

In [None]:
def scrape_topic(topic_url, topic_name):
    """Scrape one topic page and save its repositories as a CSV file.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: Topic title; the output file is ``<topic_name>.csv``,
            written relative to the current working directory.
    """
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # Include the dot so the file gets a real '.csv' extension.
    topic_df.to_csv(topic_name + '.csv', index=False)

### Putting it all together

- Function to get the list of topics
- Function to create a CSV file of the scraped repositories from a topic page


In [None]:
def scrape_topics_repos():
    """Scrape every topic and write one CSV of repositories per topic.

    Fetches the list of topics, then calls ``scrape_topic`` for each one in
    table order, printing a progress line before each topic.
    """
    print('Scraping list of topics')
    topics = scrape_topics()
    for title, url in zip(topics['Title'], topics['URL']):
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(url, title)

In [None]:
# Run the full pipeline: performs network requests and writes CSV files to the working directory.
scrape_topics_repos()

The resulting CSV files can be read back and displayed, for example with `pandas.read_csv`.