## GitHub Scraping Project

### Objective of the project
- Get the popular topics from GitHub.
- For each topic, collect its description and its URL.
- Then, for each topic, collect information about its top repositories.
- This includes the repo name, repo username, star count, and the respective URLs.

### Popular topics

<img src="topic.png"/>

### Repo Info that will be scraped

<img src="repo_info.png"/>

### Now let's start the process

In [2]:
#let's import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### The code below does the job

In [26]:
# Collect the title of every topic listed on the parsed topics page.
def get_topic_title(doc):
    """Return the titles of all topic entries found in ``doc``.

    Each title is the ``string`` attribute of a ``<p>`` tag that carries the
    title CSS classes, so an entry is ``None`` whenever that attribute is.
    """
    title_classes = "f3 lh-condensed mb-0 mt-1 Link--primary"
    return [tag.string for tag in doc.find_all(["p"], class_=title_classes)]

# Collect the description text of every topic listed on the parsed topics page.
def get_topic_desc(doc):
    """Return the whitespace-stripped description of each topic in ``doc``.

    Descriptions are read from the ``text`` of every ``<p>`` tag that carries
    the description CSS classes.
    """
    desc_classes = "f5 color-fg-muted mb-0 mt-1"
    return [tag.text.strip() for tag in doc.find_all(["p"], class_=desc_classes)]

# Build the absolute URL of every topic listed on the parsed topics page.
def get_topic_url(doc):
    """Return the absolute GitHub URL of each topic in ``doc``.

    The relative link is taken from the ``href`` of the parent element of each
    description ``<p>`` tag and prefixed with the GitHub host.
    """
    site_root = "https://github.com"
    desc_tags = doc.find_all(["p"], class_="f5 color-fg-muted mb-0 mt-1")
    return [site_root + tag.parent["href"] for tag in desc_tags]

# Scrape the topics listing page into a table of title, description and URL.
def scrape_topic(url):
    """Download the topics page at ``url`` and return its topics as a DataFrame.

    Args:
        url: Address of the topics listing page,
            e.g. "https://github.com/topics".

    Returns:
        A pandas DataFrame with the columns "title", "Description" and "URL",
        filled from get_topic_title, get_topic_desc and get_topic_url.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    result = requests.get(url, timeout=30)
    if result.status_code != 200:
        raise Exception(
            "Failed to load page {} (status code {})".format(url, result.status_code)
        )

    html_doc = BeautifulSoup(result.text, "html.parser")
    topic_info = {
        "title": get_topic_title(html_doc),
        "Description": get_topic_desc(html_doc),
        "URL": get_topic_url(html_doc),
    }
    return pd.DataFrame(topic_info)
    
# Convert a displayed star count into an integer,
# e.g. "69.7k" -> 69700.
def get_int(x):
    """Convert an abbreviated count string into an integer.

    Accepts plain integers ("512"), thousands separators ("1,234") and a
    trailing "k" (thousand) or "m" (million) suffix in either letter case
    ("69.7k", "1.2M"). Leading and trailing whitespace is ignored.

    Args:
        x: The count as text.

    Returns:
        The count as an int.

    Raises:
        ValueError: If ``x`` is empty or is not a number in one of the
            accepted forms.
    """
    multipliers = {"k": 1000, "m": 1000000}

    text = x.strip().replace(",", "")
    if not text:
        raise ValueError("cannot convert an empty string to a count")

    multiplier = multipliers.get(text[-1].lower())
    if multiplier is None:
        return int(text)

    # Round rather than truncate: binary floating point can put the product a
    # hair below the intended integer, which int() alone would turn into an
    # off-by-one result.
    return int(round(float(text[:-1]) * multiplier))

# Extract owner, repository, their URLs and the star count for one repository.
def get_repo_info(h3_tag, star_tag):
    """Return ``(username, reponame, repo_url, user_id_url, star)`` for a repo.

    ``h3_tag`` must contain at least two ``<a>`` elements: the first is read
    as the owner link and the second as the repository link. ``star_tag``
    holds the star count as text, which is converted with ``get_int``.
    """
    anchors = h3_tag.find_all("a")
    owner_anchor = anchors[0]
    repo_anchor = anchors[1]

    site_root = "https://github.com"
    username = owner_anchor.text.strip()
    reponame = repo_anchor.text.strip()
    repo_url = site_root + repo_anchor["href"]
    user_id_url = site_root + owner_anchor["href"]
    star = get_int(star_tag.text.strip())

    return username, reponame, repo_url, user_id_url, star

def get_topic_repo(topic_url):
    """Scrape the repositories listed on one topic page into a DataFrame.

    Args:
        topic_url: Absolute URL of the topic page to download.

    Returns:
        A pandas DataFrame with the columns "UserName", "RepoName", "RepoUrl",
        "UserIDUrl" and "Star", one row per repository found on the page.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    # Without a timeout a stalled connection would block the scraper forever.
    request_page = requests.get(topic_url, timeout=30)
    if request_page.status_code != 200:
        raise Exception(
            "Failed to load page {} (status code {})".format(
                topic_url, request_page.status_code
            )
        )

    topic_page = BeautifulSoup(request_page.text, "html.parser")

    # Each matching <h3> holds the owner and repository links.
    h3_tags = topic_page.find_all(
        ["h3"], class_="f3 color-fg-muted text-normal lh-condensed"
    )
    # Each matching <span> holds the star count of one repository.
    star_tags = topic_page.find_all(["span"], class_="Counter js-social-count")

    # NOTE(review): zip pairs headings and counters by position and stops at
    # the shorter list; this assumes every repository contributes exactly one
    # of each, in the same order - confirm against the live page markup.
    rows = [get_repo_info(h3, star) for h3, star in zip(h3_tags, star_tags)]

    columns = ["UserName", "RepoName", "RepoUrl", "UserIDUrl", "Star"]
    return pd.DataFrame(rows, columns=columns)


def scrap_github():
    """Scrape GitHub's topics page and save each topic's repositories as CSV.

    A "scraped_data" directory is created in the current working directory if
    it does not exist yet. For every topic returned by ``scrape_topic`` a file
    "<title>.csv" is written into that directory, unless a file with that name
    is already there, in which case the topic is skipped.
    """
    url = "https://github.com/topics"
    topics_df = scrape_topic(url)

    # Create the target directory.
    full_path = os.path.join(os.getcwd(), "scraped_data")
    if os.path.exists(full_path):
        print("Target directory file already exists")
    else:
        os.mkdir(full_path)

    # Scrape the repositories of each topic.
    for _, row in topics_df.iterrows():
        title = row["title"]
        csv_name = "{}.csv".format(title)
        # os.path.join picks the right separator on every platform; a
        # hard-coded backslash only yields a path inside the directory on
        # Windows.
        csv_path = os.path.join(full_path, csv_name)

        if os.path.exists(csv_path):
            print("Skipping as {} already exists".format(csv_name))
            continue

        print("Scraping repo for topic '{}'".format(title))
        repo_df = get_topic_repo(row["URL"])
        repo_df.to_csv(csv_path, index=False)
    

### To see the result, just call the function `scrap_github()`

In [27]:
# Run the whole scrape: topics that already have a CSV in ./scraped_data are
# skipped, the rest are downloaded and written there.
scrap_github()

Target directory file already exists
Skipping as 3D.csv already exists
Skipping as Ajax.csv already exists
Skipping as Algorithm.csv already exists
Skipping as Amp.csv already exists
Skipping as Android.csv already exists
Skipping as Angular.csv already exists
Skipping as Ansible.csv already exists
Skipping as API.csv already exists
Skipping as Arduino.csv already exists
Skipping as ASP.NET.csv already exists
Skipping as Atom.csv already exists
Skipping as Awesome Lists.csv already exists
Skipping as Amazon Web Services.csv already exists
Skipping as Azure.csv already exists
Skipping as Babel.csv already exists
Skipping as Bash.csv already exists
Skipping as Bitcoin.csv already exists
Skipping as Bootstrap.csv already exists
Skipping as Bot.csv already exists
Skipping as C.csv already exists
Skipping as Chrome.csv already exists
Skipping as Chrome extension.csv already exists
Skipping as Command line interface.csv already exists
Skipping as Clojure.csv already exists
Skipping as Code q

### We can see that we are getting the desired result