In [57]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os

## First function: first-stage scraping of the web page

In [5]:
# We will create a DataFrame of all the topics listed on the GitHub topics page.
# The DataFrame will contain the columns TOPIC_NAME, DESCRIPTION and LINKS.
# The first function is named "first_stage_scraper()".

In [59]:
def first_stage_scraper(timeout=30):
    """Scrape the GitHub topics page into a DataFrame with one row per topic.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``TOPIC_NAME``, ``DESCRIPTION`` and ``LINKS`` (absolute topic URL).

    Raises
    ------
    requests.exceptions.HTTPError
        If the page does not answer with HTTP 200.
    ValueError
        If the numbers of name, description and link tags found differ, so
        they cannot be paired up by position.
    """
    URL_one = 'https://github.com/topics'
    base_url = 'https://github.com'

    response = requests.get(URL_one, timeout=timeout)
    # Fail loudly: parsing an error page would silently yield an empty or wrong table.
    if response.status_code != 200:
        raise requests.exceptions.HTTPError(
            f"Failed to load {URL_one}: HTTP {response.status_code}", response=response
        )

    # Parse response.text (the HTML string), not the Response object itself.
    document = BeautifulSoup(response.text, 'html.parser')
    topic_mention = document.find_all('p', {'class': "f3 lh-condensed mb-0 mt-1 Link--primary"})
    desc_mention = document.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
    link_mention = document.find_all('a', {'class': 'no-underline flex-1 d-flex flex-column'})

    # The three lists are matched by position, so they must have the same length.
    if not (len(topic_mention) == len(desc_mention) == len(link_mention)):
        raise ValueError(
            "Unexpected page layout: found "
            f"{len(topic_mention)} topic names, {len(desc_mention)} descriptions "
            f"and {len(link_mention)} links."
        )

    topic_mention_textlst = [tag.text.strip() for tag in topic_mention]
    desc_mention_textlst = [tag.text.replace('\n', '').strip() for tag in desc_mention]
    # hrefs are site-relative, so prefix them with the site root.
    link_mention_textlst = [base_url + tag['href'] for tag in link_mention]

    return pd.DataFrame({'TOPIC_NAME': topic_mention_textlst,
                         'DESCRIPTION': desc_mention_textlst,
                         'LINKS': link_mention_textlst})
    

In [46]:
# Preview the first two topics; note that this call downloads the topics page again.
first_stage_scraper().head(2)

Unnamed: 0,TOPIC_NAME,DESCRIPTION,LINKS
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax


## Second function: all repositories of an individual topic in a DataFrame

In [42]:
# This function visits a topic URL taken from the first DataFrame.
# It collects the username, repo name, repo link and star count of each repository listed there
# and converts this data into a DataFrame.
# The function is named 'all_repos_of_each_topic'.

In [61]:
def all_repos_of_each_topic(page_url, timeout=30):
    """Scrape one topic page into a DataFrame of the repositories it lists.

    Parameters
    ----------
    page_url : str
        URL of a single topic page.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``USERNAME``, ``REPO_NAME``, ``STARS`` and ``REPO_LINK``, one
        row per repository heading found on the page.

    Raises
    ------
    requests.exceptions.HTTPError
        If the page does not answer with HTTP 200.
    ValueError
        If the page has fewer star counters than repository headings.
    """
    response = requests.get(page_url, timeout=timeout)
    # Do not parse an error page as if it were a topic listing.
    if response.status_code != 200:
        raise requests.exceptions.HTTPError(
            f"Failed to load {page_url}: HTTP {response.status_code}", response=response
        )

    document = BeautifulSoup(response.text, 'html.parser')
    h3_class_mention = document.find_all('h3', {'class': "f3 color-fg-muted text-normal lh-condensed"})
    star_tag_mention = document.find_all('span', {'class': "Counter js-social-count"})

    # Headings and star counters are paired by position; too few counters used
    # to surface as a bare IndexError halfway through the loop.
    if len(star_tag_mention) < len(h3_class_mention):
        raise ValueError(
            f"Unexpected page layout at {page_url}: {len(h3_class_mention)} repository "
            f"headings but only {len(star_tag_mention)} star counters."
        )

    # individual_repo_info returns (username, repo_name, stars, repo_link),
    # which is the same order as the columns below.
    records = [
        individual_repo_info(h3_tag, star_tag)
        for h3_tag, star_tag in zip(h3_class_mention, star_tag_mention)
    ]
    return pd.DataFrame(records, columns=['USERNAME', 'REPO_NAME', 'STARS', 'REPO_LINK'])

## Third function: complete info for an individual repository

In [None]:
# This function extracts the username, repo name, repo link and star count from h3_class_mention and star_tag_mention.
# These two tags (h3 and star) are extracted in the second function.
# Thus this third function takes two input parameters (h3_class_mention and star_tag_mention).
# The function is named "individual_repo_info".

In [63]:
def individual_repo_info(h3_class_mention, star_tag_mention):
    """Pull one repository's details out of its heading tag and star-counter tag.

    The first anchor inside the heading supplies the username and the second
    supplies the repository name and its site-relative link.

    Returns a tuple ``(username, repo_name, star_counts, repo_link)``.
    """
    site_root = 'https://github.com'
    anchors = h3_class_mention.find_all('a')

    owner_anchor = anchors[0]
    owner = owner_anchor.text.strip()

    repo_anchor = anchors[1]
    name = repo_anchor.text.strip()
    link = site_root + repo_anchor['href']

    stars = parse_star_count(star_tag_mention.text)
    return (owner, name, stars, link)

## Fourth function: convert the star counts to integers

In [44]:
# This removes the trailing 'k' from the star count, if present,
# converts the remaining characters to a float,
# multiplies that number by 1000,
# and finally converts the star count to an integer.
# The function is named parse_star_count.

In [65]:
def parse_star_count(star_str):
    """Convert a star-count label such as ``"523"``, ``"1.2k"`` or ``"1,234"`` to an int.

    Parameters
    ----------
    star_str : str
        Text taken from a star-counter tag; surrounding whitespace is ignored.

    Returns
    -------
    int
        The star count. Always an int, so the resulting column has one type.

    Raises
    ------
    ValueError
        If the text is empty or is not a number with an optional k/m suffix.
    """
    cleaned = star_str.strip().lower().replace(',', '')
    multipliers = {'k': 1000, 'm': 1000000}
    if cleaned.endswith(tuple(multipliers)):
        # round() rather than int(): float error makes e.g. 32.3 * 1000 come out
        # as 32299.999999999996, which int() would truncate to 32299.
        return int(round(float(cleaned[:-1]) * multipliers[cleaned[-1]]))
    return int(cleaned)

## Fifth function: extracting the topic name and URL from the DataFrame created by the first function

In [None]:
# This function extracts the TOPIC_NAME and LINKS values from the first DataFrame.
# It then scrapes each of those URLs through the functions it calls.
# I will call this function "meta_second_stage_scraper".
# It calls another function that does the actual scraping; that is the sixth function, which I will call "second_stage_scraper".

In [67]:
def meta_second_stage_scraper():
    """Scrape the topic list, then run the second-stage scraper on every topic.

    Takes no arguments: the topics table is fetched with first_stage_scraper()
    and each (TOPIC_NAME, LINKS) pair is passed to second_stage_scraper().
    """
    print("Scraping list of topics from the github/topics...")
    topics_df = first_stage_scraper()
    # Walk the two columns in lockstep instead of materialising each row.
    for topic, link in zip(topics_df['TOPIC_NAME'], topics_df['LINKS']):
        print(f"Scraping top repositories for {topic}.....")
        second_stage_scraper(topic, link)

## Sixth function: takes an individual topic URL, scrapes it using the functions called inside it, and writes the resulting DataFrame to a CSV file for that topic.

In [None]:
# As mentioned in the notes for the fifth function, this function is named "second_stage_scraper()".

In [69]:
def second_stage_scraper(topic_name, topic_url):
    """Scrape one topic page and save its repositories to ``<topic_name>.csv``.

    The file is written to the current working directory. If it already
    exists the topic is skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    topic_name : str
        Topic title, used as the CSV file name. Path separators in it are
        replaced with "_" so the file always lands in the working directory.
    topic_url : str
        URL of the topic page to scrape.

    Returns
    -------
    None
    """
    # A raw "/" or "\" in the title would point at a different (possibly missing) directory.
    safe_name = topic_name.replace('/', '_').replace('\\', '_')
    fname = safe_name + '.csv'
    if os.path.exists(fname):
        print(f"The file {fname} already exists.Skipping....")
        return
    # all_repos_of_each_topic is the second function; it returns a DataFrame.
    df = all_repos_of_each_topic(topic_url)
    df.to_csv(fname, index=False)

## Run the scraper

In [71]:
# Run the full pipeline: scrape the topic list, then write one CSV per topic into the working directory.
meta_second_stage_scraper()

Scraping list of topics from the github/topics...
Scraping top repositories for 3D.....
Scraping top repositories for Ajax.....
Scraping top repositories for Algorithm.....
Scraping top repositories for Amp.....
Scraping top repositories for Android.....
Scraping top repositories for Angular.....
Scraping top repositories for Ansible.....
Scraping top repositories for API.....
Scraping top repositories for Arduino.....
Scraping top repositories for ASP.NET.....
Scraping top repositories for Awesome Lists.....
Scraping top repositories for Amazon Web Services.....
Scraping top repositories for Azure.....
Scraping top repositories for Babel.....
Scraping top repositories for Bash.....
Scraping top repositories for Bitcoin.....
Scraping top repositories for Bootstrap.....
Scraping top repositories for Bot.....
Scraping top repositories for C.....
Scraping top repositories for Chrome.....
Scraping top repositories for Chrome extension.....
Scraping top repositories for Command-line interfa

In [73]:
# Run the pipeline a second time: topics whose CSV file already exists are skipped.
meta_second_stage_scraper()

Scraping list of topics from the github/topics...
Scraping top repositories for 3D.....
The file 3D.csv already exists.Skipping....
Scraping top repositories for Ajax.....
The file Ajax.csv already exists.Skipping....
Scraping top repositories for Algorithm.....
The file Algorithm.csv already exists.Skipping....
Scraping top repositories for Amp.....
The file Amp.csv already exists.Skipping....
Scraping top repositories for Android.....
The file Android.csv already exists.Skipping....
Scraping top repositories for Angular.....
The file Angular.csv already exists.Skipping....
Scraping top repositories for Ansible.....
The file Ansible.csv already exists.Skipping....
Scraping top repositories for API.....
The file API.csv already exists.Skipping....
Scraping top repositories for Arduino.....
The file Arduino.csv already exists.Skipping....
Scraping top repositories for ASP.NET.....
The file ASP.NET.csv already exists.Skipping....
Scraping top repositories for Awesome Lists.....
The file 

## Rough work below to check some code

In [22]:
# Keep the topics table in a variable so later cells can inspect it.
first_df = first_stage_scraper()

In [30]:
# Collect (topic name, topic link) pairs from the topics table and show them.
lst = list(zip(first_df['TOPIC_NAME'], first_df['LINKS']))
print(lst)

[('3D', 'https://github.com/topics/3d'), ('Ajax', 'https://github.com/topics/ajax'), ('Algorithm', 'https://github.com/topics/algorithm'), ('Amp', 'https://github.com/topics/amphp'), ('Android', 'https://github.com/topics/android'), ('Angular', 'https://github.com/topics/angular'), ('Ansible', 'https://github.com/topics/ansible'), ('API', 'https://github.com/topics/api'), ('Arduino', 'https://github.com/topics/arduino'), ('ASP.NET', 'https://github.com/topics/aspnet'), ('Awesome Lists', 'https://github.com/topics/awesome'), ('Amazon Web Services', 'https://github.com/topics/aws'), ('Azure', 'https://github.com/topics/azure'), ('Babel', 'https://github.com/topics/babel'), ('Bash', 'https://github.com/topics/bash'), ('Bitcoin', 'https://github.com/topics/bitcoin'), ('Bootstrap', 'https://github.com/topics/bootstrap'), ('Bot', 'https://github.com/topics/bot'), ('C', 'https://github.com/topics/c'), ('Chrome', 'https://github.com/topics/chrome'), ('Chrome extension', 'https://github.com/top

## Looking inside each topic: investigating username, repo_name, repo_link and star_count