# Top Repositories on GitHub

## Pick a website and describe your objective

- Browse through different sites and pick one to scrape. Check the "Project Ideas" section for inspiration.
- Identify the information you'd like to scrape from the site. Decide the format of the output CSV file.
- Summarize your project idea and outline your strategy in a Jupyter notebook. Use the "New" button above.

Here are the steps we'll follow:

- We're going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL
- For each topic we'll create a CSV file in the following format:

```
Repo Name,Username,Stars,Repo URL
```

In [None]:
! pip install requests --upgrade --quiet

In [None]:
import requests 


In [None]:
# Page that lists GitHub topics; this is the starting point of the scrape.
topics_url= 'https://github.com/topics'
# Download the page; the Response object is inspected in the following cells.
response= requests.get(topics_url)

In [None]:
# NOTE(review): this repeats the request made in the previous cell. It only
# works while `topics_url` is still the URL string; a later cell rebinds that
# name to a list of tags.
response= requests.get(topics_url)

In [None]:
response.status_code 
# A status code of 200 means the request succeeded.

In [None]:
# Length of the response body text, in characters.
len(response.text)

In [None]:
# Keep the HTML text under its own name; it is saved and parsed below.
page_contents = response.text

In [None]:
# Write the downloaded HTML to webpage.html using UTF-8 encoding.
with open('webpage.html' , 'w' ,encoding="utf-8") as f:
    f.write(page_contents)

 ## Use Beautiful Soup to parse and extract the data 

In [None]:
! pip install beautifulsoup4 --upgrade --quiet 

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the downloaded HTML using the 'html.parser' backend.
doc = BeautifulSoup(page_contents, 'html.parser')

In [None]:
# Inspect the type of the parsed document object.
type(doc)

In [None]:
# Class string used to select the <p> tags that hold the topic titles.
select_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topics_titles = doc.find_all('p' , {"class":select_class})
# Number of matching tags found on the page.
len(topics_titles)



In [None]:
# Display the matched title tags.
topics_titles

In [None]:
# Class string used to select the <p> tags that hold the topic descriptions.
select_class_desc = 'f5 color-text-secondary mb-0 mt-1'
topics_desc = doc.find_all('p' , {"class":select_class_desc})
# Number of matching tags found on the page.
len(topics_desc)

In [None]:
# Preview the first ten description tags.
topics_desc[:10]

In [None]:
# topics_titles and topics_desc are already collected; now collect the links.
select_url = 'd-flex no-underline'
# NOTE(review): this rebinds `topics_url`, which earlier held the page URL
# string, to the list of matching <a> tags.
topics_url = doc.find_all('a' , {"class":select_url})
len(topics_url)


In [None]:
# href attribute of the first matched link.
topics_url[0]['href']

In [None]:
# Build a full URL for the third link by prefixing the site root to its href.
topic_url2= "https://github.com" + topics_url[2]['href']
print(topic_url2)

In [None]:
# Collect the visible text of every topic-title tag. Surrounding whitespace is
# stripped so that titles are cleaned the same way as the descriptions.
topics_titles_list = [title_tag.text.strip() for title_tag in topics_titles]

print(topics_titles_list)

In [None]:
# Plain-text description of each topic, with surrounding whitespace removed.
topics_desc_list = [desc_tag.text.strip() for desc_tag in topics_desc]

# Print one description per line behind an arrow marker.
for description in topics_desc_list:
    print("*--> " + description)

In [None]:
# Site root that is prefixed to every href taken from the topic links.
base_url = "https://github.com"
topics_url_list = [base_url + link_tag['href'] for link_tag in topics_url]

# Print one URL per line behind an arrow marker.
for full_url in topics_url_list:
    print("*--> " + full_url)

In [None]:
!pip install pandas --quiet

In [None]:
import pandas as pd

### Lists we have so far
- topics_titles_list
- topics_desc_list
- topics_url_list

In [None]:
# Column name -> column values for the topics table. The mapping gets its own
# name so that the built-in `dict` type is not shadowed for the rest of the
# notebook session.
topics_data = {
    'Titles': topics_titles_list,
    'Description': topics_desc_list,
    'URL': topics_url_list,
}

topics_df = pd.DataFrame(topics_data)

topics_df

In [None]:
# (rows, columns) of the topics table.
topics_df.shape

## Create a CSV file out of a DataFrame

In [None]:
# Save the topics table as topics.csv.
# NOTE(review): index is passed as None, presumably to leave the row index out
# of the file; confirm against the pandas to_csv documentation.
topics_df.to_csv('topics.csv' , index =None  )

## Getting information out of topic page 

In [None]:
# Use the first topic URL as the example page for the cells below.
topics_url2 =topics_url_list[0]
topics_url2


In [None]:
# Download the example topic page.
response2= requests.get(topics_url2)

In [None]:
# HTTP status code of the topic-page request.
response2.status_code

In [None]:
# Length of the topic page's HTML text, in characters.
len(response2.text)

In [None]:
# Parse the topic page using the 'html.parser' backend.
topic_doc = BeautifulSoup(response2.text, 'html.parser')


In [None]:
# Inspect the type of the parsed topic page.
type(topic_doc)

- From this page we want the <b>username, repository name, and star count</b> of each repository.

In [None]:
# Each matching <h3> is the parent element that contains both the username
# link and the repository-name link.
parent_class = 'f3 color-text-secondary text-normal lh-condensed'
# A bare string passed as the second argument to find_all is treated by
# BeautifulSoup as a CSS class filter.
parent = topic_doc.find_all('h3' , parent_class) 
len(parent)

In [None]:
# Display the first matched heading.
parent[0]

In [None]:
# All links inside the first heading, then the stripped text of the first one.
a_tags = parent[0].find_all('a')
a_tags[0].text.strip()

In [None]:
# Stripped text of the second link in the heading.
a_tags[1].text.strip()

In [None]:
# href attribute of the second link in the heading.
a_tags[1]['href']

In [None]:
# Class string used to select the <a> tags that show each repository's stars.
star_class = 'social-count float-none'
star_count = topic_doc.find_all('a' , {'class': star_class}) 
# Number of star-count tags found on the page.
len(star_count)

In [None]:
# Displayed star-count text of the first match, with whitespace stripped.
star_count[0].text.strip()

In [None]:
def parse_star(stars_str):
    """Convert a displayed star count such as ``"12.3k"`` into an integer.

    Args:
        stars_str: Text of a star counter, e.g. ``"870"``, ``"1,234"`` or
            ``"12.3k"``. Surrounding whitespace and thousands separators are
            ignored, and the ``k`` suffix is case-insensitive.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Indexing [-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError("star count text is empty")
    if cleaned[-1] in 'kK':
        # round() instead of int(): float('1.005') * 1000 evaluates to
        # 1004.9999999999999, which truncation would turn into 1004.
        return round(float(cleaned[:-1]) * 1000)
    return int(cleaned)

# Try the parser on the first star counter found on the page.
print(parse_star(star_count[0].text.strip()))

In [None]:
def get_repo_info(parent, star_tag):
    """Extract one repository's details from its heading and star-count tags.

    Relies on the module-level ``base_url`` and ``parse_star``.

    Args:
        parent: Heading tag whose first link is the username and whose second
            link is the repository.
        star_tag: Tag whose text is the repository's displayed star count.

    Returns:
        A tuple ``(username, repo_name, repo_url, stars)``.

    Raises:
        IndexError: If the heading contains fewer than two links.
    """
    a_tags = parent.find_all('a')
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    # The href is prefixed with the site root to form the full repository URL.
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star(star_tag.text.strip())

    return username, repo_name, repo_url, stars

In [None]:
# Column-oriented store: one list per output column, filled one repository at
# a time.
topic_repo_info = {
    'username': [],
    'repo_name': [],
    'stars': [],
    'repo_url': [],
}
for position, heading in enumerate(parent):
    # Headings and star counters are paired by their position on the page.
    owner, name, url, star_total = get_repo_info(heading, star_count[position])
    topic_repo_info['username'].append(owner)
    topic_repo_info['repo_name'].append(name)
    topic_repo_info['repo_url'].append(url)
    topic_repo_info['stars'].append(star_total)

topic_repo_info

## So far we have done this for only one topic; we need to do it for all the topics

In [None]:
# Turn the column lists into a table and display it.
topics_repo_df  = pd.DataFrame(topic_repo_info)
topics_repo_df

In [None]:
def parse_star(stars_str):
    """Convert a displayed star count such as ``"12.3k"`` into an integer.

    Args:
        stars_str: Text of a star counter, e.g. ``"870"``, ``"1,234"`` or
            ``"12.3k"``. Surrounding whitespace and thousands separators are
            ignored, and the ``k`` suffix is case-insensitive.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a number.
    """
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Indexing [-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError("star count text is empty")
    if cleaned[-1] in 'kK':
        # round() instead of int(): float('1.005') * 1000 evaluates to
        # 1004.9999999999999, which truncation would turn into 1004.
        return round(float(cleaned[:-1]) * 1000)
    return int(cleaned)



def get_repo_info(parent , star_tag):
    """Return ``(username, repo_name, repo_url, stars)`` for one repository.

    ``parent`` is the heading tag holding the username link followed by the
    repository link; ``star_tag`` is the tag whose text is the star count.
    Uses the module-level ``base_url`` and ``parse_star``.
    """
    links = parent.find_all('a')
    owner_link = links[0]
    repo_link = links[1]
    return (
        owner_link.text.strip(),
        repo_link.text.strip(),
        base_url + repo_link['href'],
        parse_star(star_tag.text.strip()),
    )



def get_repo_all(topics_url):
    """Download one topic page and return its repositories as a DataFrame.

    Args:
        topics_url: URL of a single topic page (one string, not a list).

    Returns:
        A DataFrame with the columns ``username``, ``repo_name``, ``stars``
        and ``repo_url``, one row per repository heading found on the page.

    Raises:
        Exception: If the page does not answer with status code 200.
        IndexError: If the page has fewer star counters than repository
            headings, so the two cannot be paired up.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(topics_url, timeout=30)
    if response.status_code != 200:
        raise Exception("Failed to load page")
    topic_doc = BeautifulSoup(response.text, 'html.parser')

    parent_class = 'f3 color-text-secondary text-normal lh-condensed'
    parent = topic_doc.find_all('h3', {'class': parent_class})

    star_class = 'social-count float-none'
    star_count = topic_doc.find_all('a', {'class': star_class})

    # Headings and star counters are paired by position, so fail with a clear
    # message up front instead of a bare IndexError halfway through.
    if len(star_count) < len(parent):
        raise IndexError(
            "Found {} repository headings but only {} star counters".format(
                len(parent), len(star_count)
            )
        )

    # Each entry is (username, repo_name, repo_url, stars).
    rows = [
        get_repo_info(heading, star_tag)
        for heading, star_tag in zip(parent, star_count)
    ]

    topic_repo_info = {
        'username': [row[0] for row in rows],
        'repo_name': [row[1] for row in rows],
        'stars': [row[3] for row in rows],
        'repo_url': [row[2] for row in rows],
    }

    return pd.DataFrame(topic_repo_info)

In [None]:
# URL of the topic at index 11.
topics_url_list[11]

In [None]:
# get_repo_all expects the URL of one topic page, so pass a single entry
# rather than the whole list of URLs.
get_repo_all(topics_url_list[11])

In [None]:
def get_titles(doc):
    """Return the topic titles found in a parsed topics page.

    Args:
        doc: Parsed page offering ``find_all``, e.g. a BeautifulSoup document.

    Returns:
        A list with the text of every matching ``<p>`` tag. Surrounding
        whitespace is stripped because the titles are later used to build
        file names.
    """
    select_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    topics_titles = doc.find_all('p', {"class": select_class})

    return [title_tag.text.strip() for title_tag in topics_titles]


def get_desc(doc):
    """Return the whitespace-stripped topic descriptions of a parsed page."""
    description_class = 'f5 color-text-secondary mb-0 mt-1'
    description_tags = doc.find_all('p', {"class": description_class})
    return [tag.text.strip() for tag in description_tags]


def get_url(doc):
    """Return the full URL of every topic link found in a parsed page."""
    site_root = "https://github.com"
    link_class = 'd-flex no-underline'
    link_tags = doc.find_all('a', {"class": link_class})

    # Each href is prefixed with the site root to form the full URL.
    return [site_root + tag['href'] for tag in link_tags]
    

def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A DataFrame with the columns ``titles``, ``description`` and ``url``,
        filled by ``get_titles``, ``get_desc`` and ``get_url``.

    Raises:
        Exception: If the page does not answer with status code 200.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    topics_url = 'https://github.com/topics'
    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(topics_url, timeout=30)
    if response.status_code != 200:
        raise Exception("Failed to load page")

    doc = BeautifulSoup(response.text, 'html.parser')

    topics_dict = {
        "titles": get_titles(doc),
        "description": get_desc(doc),
        "url": get_url(doc),
    }

    return pd.DataFrame(topics_dict)




In [None]:
# `os` is part of Python's standard library, so there is nothing to install
# with pip; it only needs to be imported.

In [None]:
# scrape_topic (one topic page) and scrape_topics (the list of topics) are two different functions
import os
def scrape_topic(topic_url , path): 
    """Scrape one topic page into a CSV file at ``path``.

    Nothing is downloaded when a file already exists at ``path``; a notice is
    printed instead.
    """
    already_scraped = os.path.exists(path)
    if already_scraped:
        print("skippping a file here ---------")
    else:
        get_repo_all(topic_url).to_csv(path, index= None)

In [None]:
# Scrape the topics table afresh and list its column names.
topics_df =scrape_topics()
topics_df.columns


In [None]:
def scrape_topics_repos():
    """Scrape every topic and write one CSV per topic into the data folder."""
    print("List of top topics from Github ")
    topics_df = scrape_topics()

    # Create the output folder if it is not there yet.
    os.makedirs('data', exist_ok=True)

    for title, url in zip(topics_df['titles'], topics_df['url']):
        print('Scraping top repositories for "{}"'.format(title))
        scrape_topic(url, "data/" + title + ".csv")

In [None]:
# Run the full scrape: one CSV file per topic is written into the data folder.
scrape_topics_repos()