In [1]:
from bs4 import BeautifulSoup
import requests 
from googlesearch import search 
from pytrends.request import TrendReq
from urllib.parse import urlparse
import pandas as pd
import numpy as np
import re
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Get the Top links from Google
# query : query string that we want to search for.
# tld : tld stands for top level domain which means we want to search our result on google.com or google.in or some other domain.
# lang : lang stands for language.
# num : Number of results we want.
# start : First result to retrieve.
# stop : Last result to retrieve. Use None to keep searching forever.
# pause : Lapse to wait between HTTP requests. Lapse too short may cause Google to block your IP. Keeping significant lapse will make your program slow but its safe and better option.
# Return : Generator (iterator) that yields found URLs. If the stop parameter is None the iterator will loop forever.

def getSite(query):
    for site in search(query, tld="co.in", num=100, stop=100, pause=2): 
        yield site

In [3]:
# Get related quries form google trends
def getRelatedTopics(query):
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=[query])
    related_queries = pytrend.related_queries()
    for result in related_queries.get(query).get('top')['query']:
        yield result

In [4]:
def getValue(text):
    for txt in text:
        yield txt.get_text()

In [5]:
def getUrls(text):
    for txt in text:
        yield txt.get('href')

In [6]:
def getDomain(url):
    # from urlparse import urlparse  # Python 2
    parsed_uri = urlparse(url)
    return parsed_uri.netloc

In [7]:
def getUri(url):
    # from urlparse import urlparse  # Python 2
    parsed_uri = urlparse(url)
    return parsed_uri.path

In [8]:
# Build pipeline

In [9]:
def crawlWebLinks(srchStr):

    # cols {search_str, full_link, domain, URI, res_cd, title, H1, H2, H3, H4, H5, H6, link_tag, img_alt}
    df = pd.DataFrame()

    for respRank,URL in enumerate(getSite(srchStr)):
        print(str(respRank) +  ' - '  +  srchStr , end='\r')
        
        respRow = {}

        # Get Search String
        respRow['srchStr'] = srchStr

        # Get Rank
        respRow['rank'] = respRank+1

        # Get full Link
        respRow['fullLink'] = URL

        # Get response code
        
        try:
            r = requests.get(URL,verify=False) 
        except:
            print("Connection refused by the server..  " + URL)
            print("Let me sleep for 10 seconds")
            print("ZZzzzz...")
            time.sleep(10)
            print("now let me continue...")
        
        
        respStatusCode = r.status_code
        respRow['respCD'] = respStatusCode
        soup = BeautifulSoup(r.content, 'html5lib') 

        # Get title
        text = soup.find_all('title') 
        for respTitle in getValue(text):
            respRow['title'] = respTitle
#             break

        # Get Headings 
        headers = ['h1','h2','h3','h4','h5','h6']

        for hTags in headers:
            respHeaders = []
            text = soup.find_all(hTags) 
            for txt in getValue(text):
                pattern =  r'(\\n)|(\\t)|(\t)|(\s\s)'
                txt = re.sub(pattern, '', txt)
                if txt != ' ':
                    respHeaders.append(txt)
            if respHeaders == []:
                respRow[hTags] = np.nan
            else:
                respRow[hTags] = respHeaders

        # Get anchor tags
        respUrls = []
        text = soup.find_all('a')
        for urls in getUrls(text):
            respUrls.append(urls)
        respRow['a'] = respUrls
        
        # Get img alt tags
        respAlts = []
        text = soup.find_all('img')
        for txt in text:
            if txt.get('alt') != None and txt.get('alt') != '':
                respAlts.append(txt.get('alt'))
        respRow['imgAlt'] = respAlts
        
        
        # Get Meta title
        respMetaTitle = []
        text = soup.find_all('meta', property="og:title")
        for txt in text:
            if txt.get('content') != None and txt.get('content') != '':
                respMetaTitle.append(txt.get('content'))
        respRow['metaTitle'] = respAlts
        
        # Get meta images
        respMetaImg = []
        text = soup.find_all('meta', property="og:image")
        for txt in text:
            if txt.get('content') != None and txt.get('content') != '':
                respMetaTitle.append(txt.get('content'))
        respRow['metaImage'] = respAlts
                
        # Get Domain
        respDomain = getDomain(URL)
        respRow['domain'] = respDomain

        # Get Uri 
        respUri = getUri(URL)    
        respRow['uri'] = respUri

        
    #     dfTemp = pd.DataFrame([respRow])
        df = df.append([respRow])
      
    return df

In [10]:
#Search keyword
srchStr = 'corona'

In [11]:
scrapedDF = crawlWebLinks(srchStr)
scrapedDF.to_csv('scraped_'+srchStr+'.csv')

Connection refused by the server..  https://www.wbhealth.gov.in/pages/corona/bulletin
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://www.turkishairlines.com/en-us/any-questions/customer-relations/corona-related-request/
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
99 - corona

In [12]:
scrapedDF.head()

Unnamed: 0,srchStr,rank,fullLink,respCD,title,h1,h2,h3,h4,h5,h6,a,imgAlt,metaTitle,metaImage,domain,uri
0,corona,1,https://indianexpress.com/article/india/covid-...,200,Coronavirus India News Live Updates: COVID-19 ...,[Coronavirus India LIVE Updates: Doubling time...,[Coronavirus (COVID-19) Tracker India News Liv...,,,,,"[https://indianexpress.com/, https://tamil.ind...","[The Indian Express, Live now, coronavirus, co...","[The Indian Express, Live now, coronavirus, co...","[The Indian Express, Live now, coronavirus, co...",indianexpress.com,/article/india/covid-19-coronavirus-india-trac...
0,corona,2,https://www.hindustantimes.com/noida/new-coron...,200,New corona cases spike after lockdown relaxati...,[New corona cases spike after lockdown relaxat...,[A total of 18 new coronavirus cases were repo...,,[partnersites],,,"[https://www.hindustantimes.com/, JavaScript:V...","[scorecardresearch, quora, hindustantimes, Ope...","[scorecardresearch, quora, hindustantimes, Ope...","[scorecardresearch, quora, hindustantimes, Ope...",www.hindustantimes.com,/noida/new-corona-cases-spike-after-lockdown-r...
0,corona,3,https://timesofindia.indiatimes.com/india/coro...,200,Corona crisis revives unity among opposition ...,[Corona crisis revives unity among opposition ...,"[Top Searches:, Coronavirus outbreak, Trending...","[News Home, Editorials, Entertainment, Videos,...",,,[Download],"[https://navbharattimes.indiatimes.com/, https...",[On cam: Pak plane crashes in residential area...,[On cam: Pak plane crashes in residential area...,[On cam: Pak plane crashes in residential area...,timesofindia.indiatimes.com,/india/corona-crisis-revives-unity-among-oppos...
0,corona,4,https://www.mygov.in/covid-19/?cbps=1,200,"#IndiaFightsCorona COVID-19 in India, Corona V...",[ #IndiaFightsCorona COVID-19],"[Main Menu, Sticky Menu, E - Pass Registerfor ...","[COVID-19 Statewise Status, There is a lot of ...",[Latest Notifications],,,"[http://india.gov.in, #section1, javascript:vo...","[Home, Menu, NIC Logo]","[Home, Menu, NIC Logo]","[Home, Menu, NIC Logo]",www.mygov.in,/covid-19/
0,corona,5,https://www.mohfw.gov.in/,200,MoHFW | Home,[Ministry of Healthand Family Welfare],"[COVID-19 INDIA as on : 22 May 2020, 08:00 IST...","[Awareness, FAQs]",,,,"[index.html, #latest-update, #site-advisories,...","[India Flag, Facebook Icon, Twitter Icon, Yout...","[India Flag, Facebook Icon, Twitter Icon, Yout...","[India Flag, Facebook Icon, Twitter Icon, Yout...",www.mohfw.gov.in,/


In [None]:
# Get related searches
for idx,relKwd in enumerate(getRelatedTopics(srchStr)):
    dfT = crawlWebLinks(relKwd)
    dfT.to_csv('scraped_'+relKwd+'.csv')
#     scrapedDF = scrapedDF.append(dfT)
    if idx == 15:
        break

Connection refused by the server..  https://www.ncbi.nlm.nih.gov/books/NBK7782/
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://academic.oup.com/journals/pages/coronavirus
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://www.delta.com/us/en/travel-update-center/overview
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://www.dhl.com/global-en/home/global-news-alerts/global-messages/coronavirus.html
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://az.gov/government-0
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://www.virginatlantic.com/covid19
Let me sleep for 10 seconds
ZZzzzz...
now let me continue...
Connection refused by the server..  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6466186/
Let me sleep for 10 

In [None]:
scrapedDF.to_csv('scraped.csv')

In [None]:
scrapedDF.head(3).T

In [12]:
for relKwd in getRelatedTopics(srchStr):
    print(relKwd)

virus corona
coronavirus
corona cases
update corona
corona india
corona news
corona in india
corona deutschland
world corona
corona live
corona symptoms
update virus corona
corona china
corona usa
corona ca
corona virus cases
india corona cases
corona map
corona aktuell
corona virus news
covid 19
corona virus india
corona cases in india
corona virus symptoms
corona uk
