## Github Topics Web Scraping

### Libraries used:
*   Beautiful Soup
*   requests
*   Pandas



### Scraping Process


*   Scraping https://github.com/topics, https://github.com/topics/python and https://github.com/topics/javascript
*   Using requests module to get the page details
*   Finding related tags or classes and creating functions
*   Using Beautiful Soup to fetch data with the help of classes.
*   Finding topics details with url, name and description
*   For each repo, fetching the username (owner), repo name, stars and URL
*   Creating data frames and saving all data into the csv files



In [None]:
import requests

In [None]:
topics_url = 'https://github.com/topics'

In [None]:
response = requests.get(topics_url)

In [None]:
response.status_code

200

In [None]:
page_content = response.text

In [None]:
with open('webpage.html', 'w', encoding='utf-8') as f:
    f.write(page_content)

In [None]:
# A leading ! runs the line as a shell command, so the library is installed directly from this Jupyter notebook
!pip install beautifulsoup4 --upgrade

Requirement already up-to-date: beautifulsoup4 in c:\users\welcome1\anaconda3\lib\site-packages (4.9.3)


In [None]:
from bs4 import BeautifulSoup

In [None]:
doc = BeautifulSoup(page_content, 'html.parser')

In [None]:
selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
topic_title_tags = doc.find_all('p', {'class':selection_class})
len(topic_title_tags)

30

In [None]:
# Collect the text of every topic-title tag matched in the previous cell.
topic_titles = []
for tag in topic_title_tags:
    topic_titles.append(tag.text)
    
# Print the whole list so the scraped titles can be checked by eye.
print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


In [None]:
desc_selector = "f5 color-text-secondary mb-0 mt-1"
topics_desc_tags = doc.find_all('p', {'class': desc_selector})
len(topics_desc_tags)

30

In [None]:
# Collect each topic description with surrounding whitespace removed.
topics_description = []

for tag in topics_desc_tags:
    topics_description.append(tag.text.strip())
    
# Print only the first description as a spot check.
print(topics_description[0])

3D modeling is the process of virtually developing the surface and structure of a 3D object.


In [None]:
topics_link_tags = doc.find_all('a',{"class":"d-flex no-underline"})

In [None]:
# The hrefs on the topic links are site-relative, so the GitHub origin is
# prefixed to build absolute URLs. base_url is also read as a global by the
# repo-info helpers defined further down.
base_url = "https://github.com"
topic_urls = []

for tag in topics_link_tags:
    topic_urls.append(base_url + tag['href'])

# Bare expression: the notebook shows the list as the cell output.
topic_urls

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android',
 'https://github.com/topics/angular',
 'https://github.com/topics/ansible',
 'https://github.com/topics/api',
 'https://github.com/topics/arduino',
 'https://github.com/topics/aspnet',
 'https://github.com/topics/atom',
 'https://github.com/topics/awesome',
 'https://github.com/topics/aws',
 'https://github.com/topics/azure',
 'https://github.com/topics/babel',
 'https://github.com/topics/bash',
 'https://github.com/topics/bitcoin',
 'https://github.com/topics/bootstrap',
 'https://github.com/topics/bot',
 'https://github.com/topics/c',
 'https://github.com/topics/chrome',
 'https://github.com/topics/chrome-extension',
 'https://github.com/topics/cli',
 'https://github.com/topics/clojure',
 'https://github.com/topics/code-quality',
 'https://github.com/topics/code-review',
 'https://github.com/topics/compil

## Creating CSV file

In [None]:
import pandas as pd

In [None]:
# Column name -> list of values; the three lists were built from the same
# topics page in the cells above.
topics_dict = {
    'Title': topic_titles,
    'Description': topics_description,
    'URLs': topic_urls,
}


In [None]:
topic_df = pd.DataFrame(topics_dict)
topic_df.to_csv("githubTopics.csv", index=None)

## Fetching information from topic links

In [None]:
topic_page_url = topic_urls[0]

In [None]:
topic_page_url

'https://github.com/topics/3d'

In [None]:
response = requests.get(topic_page_url)

In [None]:
response.status_code

200

In [None]:
len(response.text)

612038

In [None]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [None]:
topic_content = response.text

In [None]:
with open('topicswebpage.html', 'w', encoding='utf-8') as f:
    f.write(topic_content)

In [None]:
# Class attribute of the <h1> heading that wraps each repository's
# "owner / repo" links on a topic page.
h1_selection_class = "f3 color-text-secondary text-normal lh-condensed"
# Reuse the variable instead of repeating the literal so the selector is
# defined in one place only.
repo_tags = topic_doc.find_all('h1', {'class': h1_selection_class})

In [None]:
len(repo_tags)

30

In [None]:
a_tags = repo_tags[0].find_all('a')

In [None]:
a_tags[0].text.strip()

'mrdoob'

In [None]:
a_tags[1].text.strip()

'three.js'

In [None]:
repo_url = base_url + a_tags[1]['href']
repo_url

'https://github.com/mrdoob/three.js'

In [None]:
star_tags = topic_doc.find_all('a',{'class': 'social-count float-none'})

In [None]:
len(star_tags)

30

In [None]:
star_tags[0].text.strip()

'71.2k'

In [None]:
def parse_star_count(stars_str):
    """Convert a star-count label such as '71.2k' into an integer.

    Args:
        stars_str: Text of the star counter. Surrounding whitespace and
            thousands separators (',') are ignored. A trailing 'k' or 'm'
            (either case) multiplies the number by 1,000 or 1,000,000.

    Returns:
        The star count as an int.

    Raises:
        ValueError: If the string is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        raise ValueError('Star count string is empty')

    multipliers = {'k': 1_000, 'm': 1_000_000}
    suffix = cleaned[-1].lower()
    if suffix in multipliers:
        # round() instead of int(): a float product such as 32.3 * 1000 lands
        # just below 32300 and int() would truncate it to 32299.
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)

In [None]:
parse_star_count(star_tags[0].text.strip())

71200

In [None]:
def get_repo_info(h1_tag, star_tag):
    """Extract one repository's details from its heading and star tags.

    Args:
        h1_tag: Heading tag whose first two <a> tags are the owner link and
            the repository link, in that order.
        star_tag: Tag whose text is the star counter (e.g. '71.2k').

    Returns:
        A tuple ``(username, repo_name, repo_url, stars)``.
    """
    links = h1_tag.find_all('a')
    owner_link = links[0]
    repo_link = links[1]
    return (
        owner_link.text.strip(),
        repo_link.text.strip(),
        # base_url is the module-level GitHub origin defined in an earlier cell.
        base_url + repo_link['href'],
        parse_star_count(star_tag.text.strip()),
    )

In [None]:
get_repo_info(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 'https://github.com/mrdoob/three.js', 71200)

### Combining all the code above and creating reusable functions

Write a single function to:
1. Get the topics list from the main topics page
2. Get the list of the top repos from the individual topic page
3. Create a CSV file for topics.

In [None]:
def get_topic_titles(doc):
    """Return the titles of the topics listed in a parsed topics page.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        A list with the text of every matching <p> tag, in page order, with
        surrounding whitespace removed.
    """
    selection_class = "f3 lh-condensed mb-0 mt-1 Link--primary"
    title_tags = doc.find_all('p', {'class': selection_class})
    # Strip like get_topic_descs does: titles are later used to build CSV
    # file names, so stray whitespace must not leak into them.
    return [tag.text.strip() for tag in title_tags]



def get_topic_descs(doc):
    """Return the descriptions of the topics listed in a parsed topics page.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        A list with the whitespace-stripped text of every matching <p> tag,
        in page order.
    """
    css_classes = "f5 color-text-secondary mb-0 mt-1"
    return [p.text.strip() for p in doc.find_all('p', {'class': css_classes})]
    

def get_topic_urls(doc):
    """Return absolute URLs for the topics listed in a parsed topics page.

    Args:
        doc: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        A list of strings, one per matching <a> tag, made of the GitHub
        origin followed by the tag's ``href`` value.
    """
    site_root = "https://github.com"
    anchors = doc.find_all('a', {"class": "d-flex no-underline"})
    return [site_root + anchor['href'] for anchor in anchors]
    
    


def scrape_topics_repos(timeout=30):
    """Scrape the GitHub topics index page into a DataFrame.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request can block indefinitely on a stalled connection.

    Returns:
        A DataFrame with the columns 'Title', 'Description' and 'URLs', one
        row per topic found on the page.

    Raises:
        Exception: If the response status code is not 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Failed to load page {topics_url}')
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'Title': get_topic_titles(doc),
        'Description': get_topic_descs(doc),
        'URLs': get_topic_urls(doc),
    }
    return pd.DataFrame(topics_dict)

In [None]:
scrape_topics_repos().head()

Unnamed: 0,Title,Description,URLs
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android


In [None]:
def get_topic_page(topic_url, timeout=30):
    """Download a topic page and parse it with BeautifulSoup.

    Args:
        topic_url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request can block indefinitely on a stalled connection.

    Returns:
        The BeautifulSoup document built from the response body with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(topic_url, timeout=timeout)
    # Anything other than 200 means the expected page was not returned.
    if response.status_code != 200:
        raise Exception(f'Failed to load page {topic_url}')
    return BeautifulSoup(response.text, 'html.parser')


def get_repo_info(h1_tag, star_tag):
    """Return (username, repo_name, repo_url, stars) for one repository.

    Args:
        h1_tag: Heading tag; its first <a> holds the owner name and its
            second <a> holds the repository name and relative link.
        star_tag: Tag whose text is the star counter.

    Returns:
        A 4-tuple of owner name, repository name, absolute repository URL
        and star count as parsed by ``parse_star_count``.
    """
    anchors = h1_tag.find_all('a')
    user_anchor, repo_anchor = anchors[0], anchors[1]

    owner = user_anchor.text.strip()
    name = repo_anchor.text.strip()
    # base_url is the module-level GitHub origin defined in an earlier cell.
    link = base_url + repo_anchor['href']
    star_count = parse_star_count(star_tag.text.strip())
    return owner, name, link, star_count


def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed on a parsed topic page.

    Args:
        topic_doc: Parsed topic page exposing ``find_all``.

    Returns:
        A DataFrame with the columns 'username', 'repo_name', 'stars' and
        'repo_url', one row per repository heading found.
    """
    headings = topic_doc.find_all('h1', {'class': "f3 color-text-secondary text-normal lh-condensed"})
    star_links = topic_doc.find_all('a', {'class': 'social-count float-none'})

    # Headings and star links are paired by position: the n-th star link is
    # taken to belong to the n-th repository heading.
    rows = [
        get_repo_info(heading, star_links[pos])
        for pos, heading in enumerate(headings)
    ]

    # get_repo_info returns (username, repo_name, repo_url, stars).
    return pd.DataFrame({
        'username': [row[0] for row in rows],
        'repo_name': [row[1] for row in rows],
        'stars': [row[3] for row in rows],
        'repo_url': [row[2] for row in rows],
    })


def scrape_topic(topic_url, topic_name):
    """Scrape one topic page and save its repositories to '<topic_name>.csv'.

    If the CSV file already exists, a message is printed and nothing is
    downloaded or written.

    Args:
        topic_url: URL of the topic page to scrape.
        topic_name: Output path without the '.csv' extension. It may include
            a directory part (e.g. 'data/3D').

    Returns:
        None.
    """
    import os

    fname = topic_name + '.csv'
    if os.path.exists(fname):
        print(f"The file {fname} already exists!")
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # Callers pass names such as 'data/<title>', and to_csv does not create
    # missing directories, so make sure the parent folder exists first.
    out_dir = os.path.dirname(fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    topic_df.to_csv(fname, index=None)


In [None]:
doc = get_topic_page('https://github.com/topics/3d')

In [None]:
def scrape_topics_repos_df():
    """Scrape the topics index, then the top repositories of every topic.

    For each topic a progress line is printed and ``scrape_topic`` is called
    with the topic URL and the output name 'data/<Title>'.
    """
    topics = scrape_topics_repos()
    for title, url in zip(topics['Title'], topics['URLs']):
        print('Scraping top repos for "{}"'.format(title))
        scrape_topic(url, 'data/{}'.format(title))
    

In [None]:
scrape_topics_repos_df()

Scraping top repos for "3D"
The file data/3D.csv already exists!
Scraping top repos for "Ajax"
The file data/Ajax.csv already exists!
Scraping top repos for "Algorithm"
The file data/Algorithm.csv already exists!
Scraping top repos for "Amp"
The file data/Amp.csv already exists!
Scraping top repos for "Android"
The file data/Android.csv already exists!
Scraping top repos for "Angular"
The file data/Angular.csv already exists!
Scraping top repos for "Ansible"
The file data/Ansible.csv already exists!
Scraping top repos for "API"
The file data/API.csv already exists!
Scraping top repos for "Arduino"
The file data/Arduino.csv already exists!
Scraping top repos for "ASP.NET"
The file data/ASP.NET.csv already exists!
Scraping top repos for "Atom"
The file data/Atom.csv already exists!
Scraping top repos for "Awesome Lists"
The file data/Awesome Lists.csv already exists!
Scraping top repos for "Amazon Web Services"
The file data/Amazon Web Services.csv already exists!
Scraping top repos for 

## Scraping Python repos

In [None]:
python_repo_url = "https://github.com/topics/python"
python_topic_name = 'pythonRepos'

In [None]:
response = requests.get(python_repo_url)

In [None]:
response.status_code

200

In [None]:
python_page_content = response.text

In [None]:
python_doc = BeautifulSoup(python_page_content, 'html.parser')

In [None]:
def get_python_page(python_repo_url, timeout=30):
    """Download the Python topic page and parse it with BeautifulSoup.

    Args:
        python_repo_url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request can block indefinitely on a stalled connection.

    Returns:
        The BeautifulSoup document built from the response body with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(python_repo_url, timeout=timeout)
    # Anything other than 200 means the expected page was not returned.
    if response.status_code != 200:
        raise Exception(f'Failed to load page {python_repo_url}')
    return BeautifulSoup(response.text, 'html.parser')


def get_pythonrepo_info(h1_tag, star_tag):
    """Extract one repository's details from its heading and star tags.

    Args:
        h1_tag: Heading tag whose first two <a> tags are the owner link and
            the repository link, in that order.
        star_tag: Tag whose text is the star counter.

    Returns:
        A tuple ``(username, repo_name, repo_url, stars)``.
    """
    found_links = h1_tag.find_all('a')
    owner_tag = found_links[0]
    project_tag = found_links[1]
    # base_url is the module-level GitHub origin defined in an earlier cell.
    project_url = base_url + project_tag['href']
    star_total = parse_star_count(star_tag.text.strip())
    return owner_tag.text.strip(), project_tag.text.strip(), project_url, star_total


def get_python_repos(python_doc):
    """Build a DataFrame of the repositories listed on the Python topic page.

    Args:
        python_doc: Parsed topic page exposing ``find_all``.

    Returns:
        A DataFrame with the columns 'username', 'repo_name', 'stars' and
        'repo_url', one row per repository heading found.
    """
    # Repository headings and their star counters, both in page order.
    repo_tags = python_doc.find_all('h1', {'class': "f3 color-text-secondary text-normal lh-condensed"})
    star_tags = python_doc.find_all('a', {'class': 'social-count float-none'})

    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': [],
    }

    for i, h1_tag in enumerate(repo_tags):
        # Call this section's own helper rather than get_repo_info from an
        # earlier cell, so get_pythonrepo_info is no longer dead code.
        username, repo_name, repo_url, stars = get_pythonrepo_info(h1_tag, star_tags[i])
        topic_repos_dict['username'].append(username)
        topic_repos_dict['repo_name'].append(repo_name)
        topic_repos_dict['stars'].append(stars)
        topic_repos_dict['repo_url'].append(repo_url)

    return pd.DataFrame(topic_repos_dict)


def scrape_python_repo(python_repo_url, python_topic_name):
    """Scrape the Python topic page into '<python_topic_name>.csv'.

    If the CSV file already exists, a message is printed and nothing is
    downloaded or written.

    Args:
        python_repo_url: URL of the topic page to scrape.
        python_topic_name: Output path without the '.csv' extension.

    Returns:
        None.
    """
    import os

    fname = python_topic_name + '.csv'
    if not os.path.exists(fname):
        page = get_python_page(python_repo_url)
        get_python_repos(page).to_csv(fname, index=None)
    else:
        print(f"The file {fname} already exists!")


In [None]:
scrape_python_repo(python_repo_url, python_topic_name)

### JavaScript repos

In [None]:
javascript_repos_url = "https://github.com/topics/javascript"
topic_name = "javaScriptRepos"

In [None]:
response = requests.get(javascript_repos_url)
response.status_code

200

In [None]:
javascript_page_content = response.text
javascript_doc = BeautifulSoup(javascript_page_content, 'html.parser')

In [None]:
def get_javascript_page(javascript_repos_url, timeout=30):
    """Download the JavaScript topic page and parse it with BeautifulSoup.

    Args:
        javascript_repos_url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request can block indefinitely on a stalled connection.

    Returns:
        The BeautifulSoup document built from the response body with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(javascript_repos_url, timeout=timeout)
    if response.status_code != 200:
        # The message previously referenced the misspelled name
        # 'javascript_repos_urll', so this path raised NameError instead of
        # the intended exception.
        raise Exception(f'Failed to load page {javascript_repos_url}')
    return BeautifulSoup(response.text, 'html.parser')


def get_javascriptrepo_info(h1_tag, star_tag):
    """Return (username, repo_name, repo_url, stars) for one repository.

    Args:
        h1_tag: Heading tag; its first <a> holds the owner name and its
            second <a> holds the repository name and relative link.
        star_tag: Tag whose text is the star counter.

    Returns:
        A 4-tuple of owner name, repository name, absolute repository URL
        and star count as parsed by ``parse_star_count``.
    """
    link_tags = h1_tag.find_all('a')
    author_tag, repository_tag = link_tags[0], link_tags[1]
    return (
        author_tag.text.strip(),
        repository_tag.text.strip(),
        # base_url is the module-level GitHub origin defined in an earlier cell.
        base_url + repository_tag['href'],
        parse_star_count(star_tag.text.strip()),
    )


def get_javascript_repos(javascript_doc):
    """Build a DataFrame of the repositories on the JavaScript topic page.

    Args:
        javascript_doc: Parsed topic page exposing ``find_all``.

    Returns:
        A DataFrame with the columns 'username', 'repo_name', 'stars' and
        'repo_url', one row per repository heading found.
    """
    # Repository headings and their star counters, both in page order.
    repo_tags = javascript_doc.find_all('h1', {'class': "f3 color-text-secondary text-normal lh-condensed"})
    star_tags = javascript_doc.find_all('a', {'class': 'social-count float-none'})

    topic_repos_dict = {
        'username': [],
        'repo_name': [],
        'stars': [],
        'repo_url': [],
    }

    for i, h1_tag in enumerate(repo_tags):
        # Call this section's own helper rather than get_repo_info from an
        # earlier cell, so get_javascriptrepo_info is no longer dead code.
        username, repo_name, repo_url, stars = get_javascriptrepo_info(h1_tag, star_tags[i])
        topic_repos_dict['username'].append(username)
        topic_repos_dict['repo_name'].append(repo_name)
        topic_repos_dict['stars'].append(stars)
        topic_repos_dict['repo_url'].append(repo_url)

    return pd.DataFrame(topic_repos_dict)


def scrape_javascript_repo(javascript_repos_url, topic_name):
    """Scrape the JavaScript topic page into '<topic_name>.csv'.

    If the CSV file already exists, a message is printed and nothing is
    downloaded or written.

    Args:
        javascript_repos_url: URL of the topic page to scrape.
        topic_name: Output path without the '.csv' extension.

    Returns:
        None.
    """
    import os

    fname = topic_name + '.csv'
    already_saved = os.path.exists(fname)
    if already_saved:
        print(f"The file {fname} already exists!")
        return
    parsed_page = get_javascript_page(javascript_repos_url)
    repos_frame = get_javascript_repos(parsed_page)
    repos_frame.to_csv(fname, index=None)


In [None]:
scrape_javascript_repo(javascript_repos_url, topic_name)