# Top Repos For Github Topics

### Pick a website and describe your objective

- Browse through different sites and pick one to scrape. Check the "Project Ideas" section for inspiration.
- Identify the information you'd like to scrape from the site. Decide the format of the output CSV file.
- Summarize your project idea and outline your strategy in a Jupyter notebook.

### Project Outline:

- We're going to scrape https://github.com/topics
- Get a list of topics. For each topic, we'll get the topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories from the topic page
- For each topic we'll create a CSV file in the following format:

```
RepoName,UserName,Stars,RepoURL
```


### Use the requests library to download web pages

In [5]:
# libraries needed to scrape GitHub
import requests
from bs4 import BeautifulSoup
import pandas as pd
import logging as lg

# Route log records at INFO level and above to the file 'file.log'.
lg.basicConfig(filename='file.log', level=lg.INFO)


## class to scrape the names of the top topics on the GitHub topics page
class Github_Scrape_Topics:
    """Scrape the topics listed on the GitHub topics page."""

    def __init__(self):
        # Page that lists the topics to scrape.
        self.topic_url = 'https://github.com/topics'

    def scrape_topics(self):
        """Download the topics page and return its topics as a DataFrame.

        The frame has the columns ``Topics``, ``Description`` and ``url``.
        On success it is also written to ``github_topics.csv``.

        Returns:
            pandas.DataFrame: one row per topic. If the page cannot be
            downloaded or parsed, the problem is printed and logged and an
            empty frame with the same three columns is returned; no CSV
            file is written in that case.
        """
        columns = ['Topics', 'Description', 'url']

        # Download the page. A timeout keeps a stalled connection from
        # hanging the whole run.
        try:
            response = requests.get(self.topic_url, timeout=30)
        except Exception as e:
            print('error github scrape 1', str(e))
            lg.error(str(e))
            return pd.DataFrame(columns=columns)

        if response.status_code != 200:
            message = (f"Failed to load page {self.topic_url} "
                       f"(status {response.status_code})")
            print('error github scrape 1', message)
            lg.error(message)
            return pd.DataFrame(columns=columns)

        # Parse the page and pull out titles, descriptions and links.
        try:
            doc = BeautifulSoup(response.text, 'html.parser')

            title_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
            title_tags = doc.find_all('p', {'class': title_class})

            description_class = 'f5 color-fg-muted mb-0 mt-1'
            description_tags = doc.find_all('p', {'class': description_class})

            url_class = 'no-underline flex-1 d-flex flex-column'
            url_tags = doc.find_all('a', {'class': url_class})

            topic_titles = [tag.text.strip() for tag in title_tags]
            topic_descriptions = [tag.text.strip() for tag in description_tags]

            base_url = 'https://github.com'
            topic_urls = [base_url + tag['href'] for tag in url_tags]
        except Exception as e:
            print('error', e)
            lg.error(str(e))
            return pd.DataFrame(columns=columns)

        # The three lists are matched up by position, so they are only
        # meaningful when they have the same length.
        if not (len(topic_titles) == len(topic_descriptions) == len(topic_urls)):
            message = (
                f"Topic page layout mismatch: {len(topic_titles)} titles, "
                f"{len(topic_descriptions)} descriptions, "
                f"{len(topic_urls)} urls"
            )
            print('error', message)
            lg.error(message)
            return pd.DataFrame(columns=columns)

        topics_df = pd.DataFrame({
            'Topics': topic_titles,
            'Description': topic_descriptions,
            'url': topic_urls,
        })

        # Persist the extracted topics alongside returning them.
        topics_df.to_csv('github_topics.csv', index=False)
        return topics_df


## class to scrape all the repos details of individual topic and saving it to csv file
class scrape_repos:
    """Scrape the repositories listed on one topic page into a CSV file."""

    # Multipliers for abbreviated star counts such as '85k'.
    _STAR_SUFFIXES = {'k': 1_000, 'm': 1_000_000}

    def __init__(self, topic_url):
        # URL of the topic page; its last path segment names the output file.
        self.topic_url = topic_url

    def get_topic_doc(self):
        """Download the topic page and return it as a parsed document.

        Returns:
            The BeautifulSoup document, or None if the download or the
            parse raised (the error is printed and logged).
        """
        print(self.topic_url)
        # Broad catch on purpose: this is a best-effort scraper and one bad
        # topic page must not abort the whole run.
        try:
            # A timeout keeps a stalled connection from hanging the run.
            response = requests.get(self.topic_url, timeout=30)
            if response.status_code != 200:
                # Not fatal: the page is still parsed, it will simply yield
                # no repositories if it is an error page.
                lg.warning(
                    "Unexpected status %s for %s",
                    response.status_code, self.topic_url,
                )
            return BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            print("error", str(e))
            lg.error(str(e))
            return None

    def parse_star_count(self, star_str):
        """Convert a star-count label to an int (e.g. '85k' -> 85000).

        Surrounding whitespace, thousands separators (',') and the case of
        the 'k' / 'm' suffix are ignored.

        Returns:
            int, or None if the text cannot be parsed (the error is printed
            and logged).
        """
        try:
            text = star_str.strip().lower().replace(',', '')
            multiplier = self._STAR_SUFFIXES.get(text[-1:])
            if multiplier is not None:
                # round() rather than truncation so float error such as
                # x.999999 cannot drop a star.
                return int(round(float(text[:-1]) * multiplier))
            return int(text)
        except (AttributeError, TypeError, ValueError, OverflowError) as e:
            print('error', str(e))
            lg.error(str(e))
            return None

    def get_repo_info(self, h3_tag, star_tag):
        """Return repository info extracted from one repository card.

        Args:
            h3_tag: heading tag whose first two links are the user and the
                repository.
            star_tag: tag whose text is the star count.

        Returns:
            Tuple ``(user_name, repo_name, stars, repo_url)``, or None if
            the tags do not have the expected structure (the error is
            printed and logged).
        """
        base_url = 'https://github.com'
        try:
            user_link, repo_link = h3_tag.find_all('a')[:2]
            user_name = user_link.text.strip()
            repo_name = repo_link.text.strip()
            repo_url = base_url + repo_link['href']
            stars = self.parse_star_count(star_tag.text)
            return user_name, repo_name, stars, repo_url
        except Exception as e:
            print('error', str(e))
            lg.error(str(e))
            return None

    def get_topic_repos(self):
        """Scrape the topic page and write ``<topic>_repoInfo.csv``.

        The CSV has the columns topic, user_name, repo_name, stars and
        repo_url. Repositories whose card cannot be parsed are skipped
        instead of discarding the whole topic. Errors are printed and
        logged, never raised.
        """
        topic_doc = self.get_topic_doc()
        if topic_doc is None:
            # The download failure has already been reported.
            return

        # Last path segment of the URL; tolerate a trailing slash.
        topic = self.topic_url.rstrip('/').split('/')[-1]
        columns = ['topic', 'user_name', 'repo_name', 'stars', 'repo_url']

        try:
            # h3 tags hold the user name, repository name and repository URL.
            h3_class = 'f3 color-fg-muted text-normal lh-condensed'
            repo_tags = topic_doc.find_all('h3', {'class': h3_class})
            # span tags hold the star counts.
            span_id = 'repo-stars-counter-star'
            star_tags = topic_doc.find_all('span', {'id': span_id})

            if len(repo_tags) != len(star_tags):
                # Headings and counters are paired by position; a mismatch
                # means some pairs may be missing or misaligned.
                lg.warning(
                    "%s: %d repo headings but %d star counters",
                    topic, len(repo_tags), len(star_tags),
                )

            rows = []
            for h3_tag, star_tag in zip(repo_tags, star_tags):
                repo_info = self.get_repo_info(h3_tag, star_tag)
                if repo_info is None:
                    continue
                rows.append((topic,) + tuple(repo_info))

            topic_name = topic + '_repoInfo.csv'
            pd.DataFrame(rows, columns=columns).to_csv(topic_name, index=False)
            print(topic_name)
        except Exception as e:
            print('error', str(e))
            lg.error(str(e))

# Scrape the topic list, then write one CSV of repositories per topic.
scrape = Github_Scrape_Topics()
df = scrape.scrape_topics()
topic_list = list(df.url)
for index, url in enumerate(topic_list):
    repos = scrape_repos(url)
    # get_topic_repos() writes the CSV itself; its result is not assigned so
    # that `df` keeps referring to the topics table.
    repos.get_topic_repos()
    # Progress indicator: position of the topic just processed.
    print(index)


https://github.com/topics/3d
3d_repoInfo.csv
0
https://github.com/topics/ajax
ajax_repoInfo.csv
1
https://github.com/topics/algorithm
algorithm_repoInfo.csv
2
https://github.com/topics/amphp
amphp_repoInfo.csv
3
https://github.com/topics/android
android_repoInfo.csv
4
https://github.com/topics/angular
angular_repoInfo.csv
5
https://github.com/topics/ansible
ansible_repoInfo.csv
6
https://github.com/topics/api
api_repoInfo.csv
7
https://github.com/topics/arduino
arduino_repoInfo.csv
8
https://github.com/topics/aspnet
aspnet_repoInfo.csv
9
https://github.com/topics/atom
atom_repoInfo.csv
10
https://github.com/topics/awesome
awesome_repoInfo.csv
11
https://github.com/topics/aws
aws_repoInfo.csv
12
https://github.com/topics/azure
azure_repoInfo.csv
13
https://github.com/topics/babel
babel_repoInfo.csv
14
https://github.com/topics/bash
bash_repoInfo.csv
15
https://github.com/topics/bitcoin
bitcoin_repoInfo.csv
16
https://github.com/topics/bootstrap
bootstrap_repoInfo.csv
17
https://github.c

### Document and share your work