In [115]:
import os

In [116]:
# Each crawled website is saved in its own project folder.
def create_project_dir(directory):
    """Create ``directory`` (and any missing parents) unless it already exists.

    A one-line notice is printed just before the directory is created;
    nothing happens when the path is already present.
    """
    if os.path.exists(directory):
        return
    print('Creating directory ' + directory)
    os.makedirs(directory)

# Example: create_project_dir("Crawler")

In [117]:
# create queue and crawled files if not created
def create_data_files(project_name, base_url):
    """Seed the project's bookkeeping files when they are missing.

    ``queue.txt`` starts out holding ``base_url`` and ``crawled.txt`` starts
    out empty. A file that already exists is left untouched.
    """
    initial_contents = (
        ('queue.txt', base_url),
        ("crawled.txt", ''),
    )
    for file_name, contents in initial_contents:
        target = os.path.join(project_name, file_name)
        if not os.path.isfile(target):
            write_file(target, contents)


In [118]:
def write_file(path, data):
    """Create or overwrite the file at ``path`` so it holds exactly ``data``."""
    with open(path, 'w') as out:
        out.write(data)

In [119]:
# Add data onto an existing file
def append_to_file(path, data):
    """Append ``data`` followed by a newline to the file at ``path``.

    The file is opened in append mode, so it is created when absent.
    """
    with open(path, 'a') as out:
        out.write(data + '\n')

#clear the file 
# Delete the contents of a file
def delete_file_contents(path):
    """Truncate the file at ``path`` to zero length (creating it if absent)."""
    # Opening in 'w' mode is what empties the file; nothing is written.
    with open(path, 'w'):
        pass

In [120]:
# The same link may occur multiple times in a file, so we read the file
# line by line and add each link to a set, which drops the duplicates.
# 'rt' stands for "read text".
def file_to_set(file_name):
    """Read a text file into a set holding one entry per non-empty line.

    Args:
        file_name: path of the text file to read.

    Returns:
        A set of the file's lines with the trailing newline removed.
        Duplicate lines collapse into one entry. Blank lines are skipped
        so that an empty string never ends up in the set as a bogus link.
    """
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            entry = line.rstrip('\n')
            # A blank line carries no link; adding '' would later be
            # treated like a URL by whoever consumes the set.
            if entry:
                results.add(entry)
    return results

In [121]:
#converting a set to a file 
def set_to_file(links, file_name):
    """Overwrite ``file_name`` with the items of ``links``, sorted, one per line."""
    with open(file_name, "w") as out:
        out.writelines(link + "\n" for link in sorted(links))

In [122]:
from html.parser import HTMLParser
from urllib import parse


class LinkFinder(HTMLParser):
    """HTML parser that collects the absolute URL of every ``<a href=...>``.

    Feed markup with ``feed()`` and read the result with ``page_links()``.

    Args:
        base_url: root URL of the site; used to resolve links only when
            ``page_url`` is empty.
        page_url: URL of the page whose markup is being parsed. Relative
            hrefs are resolved against it.
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Called by ``feed()`` for each opening tag; records hrefs of ``<a>`` tags."""
        if tag != 'a':
            return
        # A relative href is relative to the page it appears on, not to the
        # site root: 'b.html' found on /docs/a.html means /docs/b.html.
        origin = self.page_url or self.base_url
        for (attribute, value) in attrs:
            # A valueless or empty href carries no target, so skip it
            # instead of recording the page's own URL as a link.
            if attribute == 'href' and value:
                self.links.add(parse.urljoin(origin, value))

    def page_links(self):
        """Return the set of absolute URLs collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser error callbacks (intentional no-op override)."""
        pass

In [123]:
# This block restricts crawling to links that belong to the site's own domain.
# Without it the crawler would follow links across the whole internet, so we parse each URL's network location.

from urllib.parse import urlparse
#get sub domain name for getting link which is useful

def get_domain_name(url):
    """Return the last two dot-separated labels of the URL's network location.

    For example 'http://www.rknec.edu/' gives 'rknec.edu'. Because only the
    last two labels are kept, a host such as 'www.example.co.uk' gives
    'co.uk'.

    Returns:
        The two-label domain as a string, or '' when the network location
        has fewer than two labels or cannot be split.
    """
    try:
        labels = get_sub_domain_name(url).split('.')
    except Exception:
        # Best-effort: an unusable value yields '' rather than an error, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''
    if len(labels) < 2:
        return ''
    return labels[-2] + '.' + labels[-1]

def get_sub_domain_name(url):
    """Return the network-location part of ``url`` (e.g. 'www.rknec.edu').

    Returns:
        ``urlparse(url).netloc``, or '' when parsing raises an error.
    """
    try:
        return urlparse(url).netloc
    except Exception:
        # Best-effort lookup: malformed input maps to ''. Catching Exception
        # instead of a bare except lets KeyboardInterrupt/SystemExit through.
        return ''
# Sanity check: show the two-label domain extracted from the homepage URL.
print(get_domain_name('http://www.rknec.edu/'))

rknec.edu


In [124]:
#now lets program the spider for connecting to the real server
from urllib.request import urlopen

# To crawl faster, the state below is stored on the class so that it is shared among all spiders;
# for instance, there may be 6 spiders extracting links from the website at the same time.
class Spider:
    """Crawl pages of one site and persist the queue/crawled sets to disk.

    All state is stored on the class rather than on instances, so every
    Spider object reads and writes the same queue and crawled sets.
    Constructing a Spider sets up the project files and immediately crawls
    ``base_url``.
    """

    # Folder that holds this crawl's data files.
    project_name = ''
    # Homepage of the site; the first page crawled.
    base_url = ''
    # Only links whose get_domain_name() equals this value are queued.
    domain_name = ''
    # Paths of the on-disk copies of the two sets below.
    queue_file = ''
    crawled_file = ''
    # URLs waiting to be crawled / URLs already crawled.
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/queue.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spider.base_url)

    @staticmethod
    def boot():
        """Create the project directory and data files if needed, then load both sets."""
        create_project_dir(Spider.project_name)
        create_data_files(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, page_url):
        """Crawl ``page_url`` once: report progress, queue its links, save state.

        Does nothing when ``page_url`` has already been crawled.
        """
        if page_url in Spider.crawled:
            return
        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled  ' + str(len(Spider.crawled)))
        Spider.add_links_to_queue(Spider.gather_links(page_url))
        # discard() rather than remove(): the page may have been passed in
        # directly without being in the queue (e.g. a queue file left over
        # from an earlier run), and that must not abort the crawl.
        Spider.queue.discard(page_url)
        Spider.crawled.add(page_url)
        Spider.update_files()

    @staticmethod
    def gather_links(page_url):
        """Download ``page_url`` and return the set of links found in its HTML.

        Only responses whose Content-Type mentions 'text/html' are parsed.
        The body is decoded with the charset declared in the response
        headers, falling back to UTF-8; undecodable bytes are replaced
        rather than discarding the whole page.

        Returns:
            A set of absolute URLs, or an empty set when the request or the
            parsing fails (the error is printed, not raised, because a site
            may refuse to be crawled).
        """
        html_string = ''
        try:
            # The with-block closes the connection even when reading fails.
            with urlopen(page_url) as response:
                # The header may be absent; treat that as "not HTML".
                content_type = response.getheader('Content-Type') or ''
                if 'text/html' in content_type:
                    charset = response.headers.get_content_charset() or 'utf-8'
                    html_string = response.read().decode(charset, errors='replace')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    @staticmethod
    def add_links_to_queue(links):
        """Queue every link that is new and belongs to the crawl's domain."""
        for url in links:
            if (url in Spider.queue) or (url in Spider.crawled):
                continue
            if Spider.domain_name != get_domain_name(url):
                continue
            Spider.queue.add(url)

    @staticmethod
    def update_files():
        """Write the in-memory queue and crawled sets back to their files."""
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)

In [126]:
#the main program which will call all spiders and various methods

import threading
from queue import Queue

# The queue holds the jobs (URLs to crawl); the threads are the spiders that
# take jobs from the queue and crawl them.
# Constants describing this crawl:
PROJECT_NAME = 'rcoem'  # folder that stores this crawl's data files
HOMEPAGE = 'http://www.rknec.edu/'  # first page to crawl
DOMAIN_NAME = get_domain_name(HOMEPAGE)  # only links on this domain get queued
QUEUE_FILE = PROJECT_NAME + "/queue.txt"  # same path Spider builds for its queue file
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'  # same path Spider builds for its crawled file

# NOTE(review): not used in the visible cells; presumably the number of
# worker threads to start later. Confirm.
NUMBER_OF_THREADS = 8

# Job queue for worker threads; nothing in the visible cells consumes it yet.
queue = Queue()
# Constructing the Spider boots the project files and crawls HOMEPAGE
# immediately. As the last expression of the cell, its repr is displayed.
Spider(PROJECT_NAME,HOMEPAGE,DOMAIN_NAME)



Creating directory rcoem
First spider now crawling http://www.rknec.edu/
Queue 1 | Crawled  0


<__main__.Spider at 0x10e07b0b8>

In [1]:
import requests
from bs4 import BeautifulSoup
 
'''
URL of the archive web-page which provides link to
all video lectures. It would have been tiring to
download each video manually.
In this example, we first crawl the webpage to extract
all the links and then download videos.
'''
 
# Specify the URL of the page to scan for links here.
# NOTE(review): the description above talks about a video-lecture archive,
# but this URL points at a page on nseindia.com. Confirm which is intended.
archive_url = "https://www.nseindia.com/corporates/corporateHome.html"
 
def get_video_links(url=None, suffix='pd', timeout=30):
    """Return absolute URLs of the links on a page whose href ends with ``suffix``.

    Args:
        url: page to scan; defaults to the module-level ``archive_url``.
        suffix: keep only hrefs ending with this text. The default 'pd' is
            the filter the original code used.
            NOTE(review): the original comment spoke of '.mp4', so the
            intended suffix may be '.mp4' or 'pdf'. Confirm.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        A list of absolute URLs, in document order.
    """
    # Local import keeps this cell runnable on its own in a fresh kernel.
    from urllib.parse import urljoin

    page_url = archive_url if url is None else url

    # create response object
    r = requests.get(page_url, timeout=timeout)

    # create beautiful-soup object
    soup = BeautifulSoup(r.content, 'html5lib')

    # href=True keeps only anchors that have an href attribute; indexing
    # link['href'] on one without it would raise KeyError.
    links = soup.find_all('a', href=True)

    # urljoin resolves relative hrefs against the page and leaves absolute
    # hrefs unchanged; plain string concatenation produced invalid URLs.
    video_links = [
        urljoin(page_url, link['href'])
        for link in links
        if link['href'].endswith(suffix)
    ]

    return video_links

# Fetch the page and show the matching links.
print(get_video_links())

SyntaxError: invalid syntax (__init__.py, line 53)