In [16]:
import requests
from bs4 import BeautifulSoup
import os
import shutil
import time
import sys
import re
import json
import random
import pandas as pd

In [85]:
class ProxyConstructor(object):

    """Constructs Proxies to use in requests."""

    # Endpoint queried by load_proxies (geonode proxy-list API, restricted to http/https).
    PROXY_LIST_URL = "https://proxylist.geonode.com/api/proxy-list?limit=50&page=1&sort_by=lastChecked&sort_type=desc&google=true&speed=medium&protocols=http%2Chttps"

    # Seconds to wait for the proxy-list API before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, proxy_path='proxy.json'):
        """
        Proxies which can be used for the HTTP-Request.

        If no file exists at proxy_path, the proxy list is downloaded first
        (see load_proxies) and then read back from disk.

        arguments:
        -----------------
        proxy_path:
            Path of the JSON file holding the proxies. Defaults to 'proxy.json'
            in the current working directory; may also contain directories.

        attributes:
        -----------------
        proxies:
            List of possible proxies as 'ip:port' strings.
        """
        self.proxy_path = proxy_path
        # os.path.exists also works when proxy_path contains a directory part;
        # a membership test against os.listdir() only handles bare file names.
        if not os.path.exists(self.proxy_path):
            self.load_proxies()

        self.proxies = self.read_proxies()

    def load_proxies(self):
        """(None)  ---> JSON

        Loads proxies from the geonode proxy-list API (see PROXY_LIST_URL) and
        writes them to self.proxy_path as {"proxies": ["ip:port", ...]}.

        Only entries whose protocol list joins to exactly 'http' or 'https' are
        kept, i.e. entries advertising a single one of these protocols.

        returns:
        ------------
        None
            Saves JSON File with different proxies to self.proxy_path.

        raises:
        ------------
        requests.RequestException
            If the API cannot be reached in time or answers with an HTTP error.
        """
        response = requests.get(self.PROXY_LIST_URL, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly here instead of with a confusing KeyError on an error payload.
        response.raise_for_status()
        entries = response.json()['data']

        proxies = [
            f"{entry['ip']}:{entry['port']}"
            for entry in entries
            if ', '.join(entry['protocols']) in ('http', 'https')
        ]

        with open(self.proxy_path, 'w') as proxy_file:
            json.dump(dict(proxies=proxies), proxy_file)

        print(f'{self.proxy_path} created in directory.')

    def read_proxies(self):
        """(None) ---> list

        This method reads proxies from the json file at self.proxy_path.

        returns:
        ------------
        list
            The value stored under the 'proxies' key of the file.
        """
        with open(self.proxy_path, 'r') as proxy_file:
            stored = json.load(proxy_file)

        return stored['proxies']

    def get_proxy(self):
        """(None) ---> dict

        Returns a random proxy from the list held in self.proxies.

        returns:
        --------------
        dict
            {'http': proxy, 'https': proxy} with one randomly picked proxy,
            ready to be passed as the `proxies` argument of requests.

        raises:
        --------------
        IndexError
            If the proxy list is empty.
        """
        if not self.proxies:
            # Same exception type random.choice would raise, but with a usable message.
            raise IndexError(f'No proxies available in {self.proxy_path}.')

        proxy = random.choice(self.proxies)

        return {'http': proxy, 'https': proxy}
        
        

In [55]:
# Fetch the raw proxy list from the geonode API; `json_file` holds the parsed
# payload (a dict) and is consumed by the following cell.
response = requests.get("https://proxylist.geonode.com/api/proxy-list?limit=50&page=1&sort_by=lastChecked&sort_type=desc&google=true&speed=medium&protocols=http%2Chttps", timeout=30)
# Stop here on an HTTP error instead of failing later on a missing 'data' key.
response.raise_for_status()
json_file = response.json()

In [69]:
# Tabulate the API entries, keeping only the connection-related columns.
free_proxies = pd.DataFrame(json_file['data'])[['ip', 'port', 'protocols']]
# Each 'protocols' value is joined into one comma-separated string.
free_proxies['protocols'] = free_proxies['protocols'].map(', '.join)

In [87]:
# List the entries of the current working directory (e.g. to check whether proxy.json is present).
os.listdir()

['.ipynb_checkpoints',
 'manga_all_chapters.url',
 'manga_scraper.ipynb',
 'OnePiece',
 'one_piece_scraper.py',
 'proxy.json']

In [78]:
# Display only the rows whose joined protocol string is exactly 'http' or 'https'.
# The result is shown, not assigned, so `free_proxies` itself is unchanged.
free_proxies[free_proxies['protocols'].isin(['http', 'https'])]

Unnamed: 0,ip,port,protocols,proxy
0,93.170.200.180,8080,http,93.170.200.180:8080
1,165.29.108.250,3128,http,165.29.108.250:3128
2,5.16.0.243,1256,http,5.16.0.243:1256
3,78.47.104.35,3128,https,78.47.104.35:3128
4,190.95.214.178,8080,http,190.95.214.178:8080
5,147.135.255.62,8139,https,147.135.255.62:8139


In [73]:
# Add a 'proxy' column in 'ip:port' form; both parts are cast to str before concatenation.
free_proxies['proxy'] = free_proxies['ip'].astype(str) + ':' + free_proxies['port'].astype(str)

In [74]:
# Display the full proxy table as it currently stands.
free_proxies

Unnamed: 0,ip,port,protocols,proxy
0,93.170.200.180,8080,http,93.170.200.180:8080
1,165.29.108.250,3128,http,165.29.108.250:3128
2,5.16.0.243,1256,http,5.16.0.243:1256
3,78.47.104.35,3128,https,78.47.104.35:3128
4,190.95.214.178,8080,http,190.95.214.178:8080
5,147.135.255.62,8139,https,147.135.255.62:8139


In [86]:
# Build the proxy provider with the default path 'proxy.json' (the file is
# downloaded if the constructor does not find it), then print one random proxy mapping.
proxy_construct = ProxyConstructor()
print(proxy_construct.get_proxy())

proxy.json created in directory.
{'http': '147.135.255.62:8139', 'https': '147.135.255.62:8139'}


In [10]:
# Draw another random proxy; the mapping uses the same address for 'http' and 'https'.
proxy = proxy_construct.get_proxy()
proxy

{'http': '103.115.14.200:80', 'https': '103.115.14.200:80'}

In [88]:
# Smoke-test a random proxy against a known page.
# NOTE(review): `header` is only assigned in a cell further down the notebook,
# so this cell fails on a fresh kernel when the cells are run top to bottom.
# The timeout keeps an unresponsive proxy from blocking the kernel indefinitely.
requests.get('http://www.google.com', headers=header, proxies=proxy_construct.get_proxy(), timeout=10)

<Response [200]>

In [9]:
proxies = []
# The HTTP header is spelled 'User-Agent' (hyphen); with an underscore the
# random user agent is sent under an unrelated header name.
header = {'User-Agent': UserAgentConstructor().get_user_agent()}

response = requests.get('http://free-proxy.cz/en/proxylist/country/all/http/ping/all/2', headers=header, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
soup_ = soup.find_all('tr')

# The first <tr> is skipped; every remaining row is printed for inspection.
for item in soup_[1:]:
    print(item)
    item = item.find_all('td')
    # Extraction was disabled in the original cell (wrapped in a string literal),
    # so `proxies` stays empty:
    # ip_address, port = item[0].text, item[1].text
    # proxy = ':'.join([ip_address, port])
    # proxies.append(proxy)
proxies_dict = dict(proxies=proxies)

print(proxies_dict)

<tr><th><label>Protocol</label></th><td><input checked="checked" id="frmsearchFilter-protocol-0" name="protocol" type="radio" value="all"/><label for="frmsearchFilter-protocol-0">All</label><input id="frmsearchFilter-protocol-1" name="protocol" type="radio" value="http"/><label for="frmsearchFilter-protocol-1">HTTP</label><input id="frmsearchFilter-protocol-2" name="protocol" type="radio" value="https"/><label for="frmsearchFilter-protocol-2">HTTPS</label><input id="frmsearchFilter-protocol-3" name="protocol" type="radio" value="socks"/><label for="frmsearchFilter-protocol-3">Socks 4/5</label><input id="frmsearchFilter-protocol-4" name="protocol" type="radio" value="socks4"/><label for="frmsearchFilter-protocol-4">Socks 4</label><input id="frmsearchFilter-protocol-5" name="protocol" type="radio" value="socks5"/><label for="frmsearchFilter-protocol-5">Socks 5</label></td></tr>
<tr><th><label>Anonymity</label></th><td><input id="frmsearchFilter-anonymity-0" name="anonymity" type="radio" 

In [None]:
# Print the most recent `response` object (its repr shows the HTTP status code).
print(response)

In [7]:
class UserAgentConstructor(object):

    """Constructs User Agents to use as headers in requests."""

    # Page scraped by load_user_agents.
    USER_AGENT_URL = 'https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/'

    # Seconds to wait for the user-agent page before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, user_agent_path='user_agents.json'):
        """
        User Agents which can be used for the HTTP-Request headers.

        If no file exists at user_agent_path, the list is downloaded first
        (see load_user_agents) and then read back from disk.

        arguments:
        -----------------
        user_agent_path:
            Path of the JSON file holding the user agents. Defaults to
            'user_agents.json' in the current working directory; may also
            contain directories.

        attributes:
        -----------------
        user_agents:
            List of possible user agents.

        """
        self.user_agent_path = user_agent_path
        # os.path.exists also works when the path contains a directory part;
        # a membership test against os.listdir() only handles bare file names.
        if not os.path.exists(self.user_agent_path):
            self.load_user_agents()
        self.user_agents = self.read_user_agents()

    def load_user_agents(self):
        """(None)  ---> JSON

        Loads user agents from the page at USER_AGENT_URL and writes them to
        self.user_agent_path as {"user_agents": [...]}.

        returns:
        ------------
        None
            Saves JSON File with different user agents to self.user_agent_path.

        raises:
        ------------
        requests.RequestException
            If the page cannot be reached in time or answers with an HTTP error.
        """
        response = requests.get(self.USER_AGENT_URL, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of silently storing an empty list.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE(review): the text of *every* <td> is stored; if the table has
        # columns besides the user-agent string, those end up in the list too
        # - verify against the page markup.
        parsed_user_agents = dict(user_agents=[cell.text for cell in soup.find_all('td')])

        with open(self.user_agent_path, 'w') as user_agent_file:
            json.dump(parsed_user_agents, user_agent_file)

        print(f'{self.user_agent_path} created in directory.')

    def read_user_agents(self):
        """(None) ---> list

        Reads the user agents from the json file at self.user_agent_path.

        returns:
        ------------
        list
            The value stored under the 'user_agents' key of the file.

        """
        with open(self.user_agent_path, 'r') as user_agent_file:
            stored = json.load(user_agent_file)

        return stored['user_agents']

    def get_user_agent(self):
        """(None) ---> str

        Returns a random user agent from the list held in self.user_agents.

        returns:
        --------------
        str
            a randomly picked user agent from the list of possible user agents.

        raises:
        --------------
        IndexError
            If the user agent list is empty.
        """
        if not self.user_agents:
            # Same exception type random.choice would raise, but with a usable message.
            raise IndexError(f'No user agents available in {self.user_agent_path}.')

        return random.choice(self.user_agents)
        

In [15]:
# NOTE: despite its name, `headers` is the UserAgentConstructor instance, not a
# header dict; the second line displays one randomly chosen user-agent string.
headers = UserAgentConstructor()
headers.get_user_agent()

'Mozilla/5.0 (Linux; Android 4.4.3; KFTHWI Build/KTU84M) AppleWebKit/537.36 (KHTML, like Gecko) Silk/47.1.79 like Chrome/47.0.2526.80 Safari/537.36'

In [188]:
class OnePieceMangaScraper(HTTPHeaders):
    """
    This class implements a webscraper to collect One-Piece Manga and saves it locally.

    NOTE(review): the base class HTTPHeaders is not defined in the visible part
    of the notebook and __init__ does not call super().__init__() - confirm the
    base class is still needed.
    """

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """

        This class scrapes One-Piece Manga Chapters from the webpage: "https://onepiece-manga-online.net/"

        attributes:
        -------------------
        headers:
            Headers used for the requests. Only a User-Agent is set, so that the
            calls do not go out with the default client identification. Adapt it
            if the given header causes problems.

        base_url:
            URL to make calls to. This class only works with this URL!

        response_code:
            Response object of the initial call to base_url, used to check that
            the website is reachable at all. Created automatically at
            initialization.

        chapter_dict:
            Created automatically at initialization. Dictionary mapping chapter
            numbers to the URLs of the chapters.

        raises:
        -------------------
        Exception
            If the initial call to base_url does not answer with status 200.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        self.base_url = 'https://onepiece-manga-online.net/'

        # Use the same headers as every later request so the reachability check
        # reflects what the scraper will actually send.
        self.response_code = requests.get(self.base_url, headers=self.headers,
                                          timeout=self.REQUEST_TIMEOUT)
        if self.response_code.status_code != 200:
            raise Exception('Something went wrong with the webpage connection. HTTP-Code ' + str(self.response_code))
        print('HTTP-Response Code : ' + str(self.response_code))
        self.chapter_dict = self._create_chapter_dict()

    def _create_chapter_dict(self):
        """(None) ---> (dict)

        Scrapes all the chapters from https://onepiece-manga-online.net/ and creates a dictionary with
        all chapters and the corresponding hyperlinks.

        Links whose chapter number is greater than the value reported by
        get_latest_chapter are ignored, as are links without any number.

        returns:
        ---------------
        chapter_urls:
            Dictionary {chapter number (int): chapter URL (str)}.

        raises:
        ---------------
        RuntimeError
            If the chapter list widget is not found on the page.
        """
        latest_chapter = self.get_latest_chapter()
        chapter_urls = {}

        response = requests.get(self.base_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        widgets = soup.find_all('li', attrs={'class': 'widget ceo_latest_comics_widget'})
        if not widgets:
            raise RuntimeError('Chapter list widget not found on ' + self.base_url)

        for link in widgets[0].find_all('a', href=True):
            href = link['href']
            # Last path segment, whether or not the URL ends with a slash.
            slug = href.rstrip('/').split('/')[-1]
            numbers = re.findall(pattern=r'\d+', string=slug)
            if not numbers:
                continue
            chapter = int(numbers[0])
            if chapter <= latest_chapter:
                chapter_urls[chapter] = href

        return chapter_urls

    def scrape_chapter(self, chapter_no):
        """(int) ---> (None)

        Scrapes a chapter by given chapter-ID and saves it locally in OnePiece/Chapter_{chapter_no}/Page_...

        The page image URLs are read from the chapter page's og:image meta tags
        and handed to _extract_images, which downloads and stores them.

        params:
        ---------------
        chapter_no:
            Number of chapter which should be scraped

        returns:
        ---------------
        None
            Saves images in a local folder.

        raises:
        ---------------
        KeyError
            If chapter_no is not a key of chapter_dict.
        """
        chapter_url = self.chapter_dict[chapter_no]
        try:
            response = requests.get(chapter_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # Without the chapter page there is nothing to download; report and
            # give up on this chapter instead of continuing with undefined data.
            print(f'Exception occured at scraping Chapter {chapter_no}')
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        image_tags = soup.find_all('meta', attrs=dict(property="og:image"))
        image_urls = [tag['content'] for tag in image_tags if tag.has_attr('content')]

        # The first og:image entry is skipped - presumably it is not a manga
        # page (e.g. a preview image); TODO confirm against the page markup.
        page_content_ = dict(enumerate(image_urls[1:]))
        self._extract_images(page_content=page_content_, chapter_no=chapter_no)
        sys.stdout.write(f'\n - Chapter {chapter_no} saved. -')

    def scrape_all_chapters(self):
        """(None) ---> (None)

        Scrapes all chapters from the webpage: https://onepiece-manga-online.net/ and saves them
        to a local folder, from chapter 1 up to and including the latest chapter.

        Chapter numbers missing from chapter_dict are reported and skipped.
        One second is waited between chapters.

        returns:
        ---------------
        None
            Saves images in a local folder.

        """
        latest_chapter = self.get_latest_chapter()
        # Inclusive upper bound: the latest chapter itself must be scraped too.
        for chapter_no in range(1, latest_chapter + 1):
            if chapter_no not in self.chapter_dict:
                sys.stdout.write(f'\n - Chapter {chapter_no} not found on the webpage, skipped. -')
                continue
            sys.stdout.flush()
            sys.stdout.write(f'\r ---- Currently Scraping Chapter {chapter_no} of {latest_chapter} ----')
            self.scrape_chapter(chapter_no=chapter_no)
            time.sleep(1)

    def get_latest_chapter(self):
        """() ---> (int)

        Gets the number of the latest chapter from the chapter table on
        http://onepiece-tube.com/kapitel-mangaliste: the first cell of the
        second table row inside the first 'sagatable' div.

        returns:
        ---------------
        int
            Number of the latest chapter.

        raises:
        ---------------
        requests.RequestException
            If the page cannot be reached in time or answers with an HTTP error.
        """
        response = requests.get('http://onepiece-tube.com/kapitel-mangaliste#oben',
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table_div = soup.find_all('div', attrs={'class': 'sagatable'})
        all_rows = table_div[0].find_all('tr')
        # Row 0 is skipped (presumably the table header - TODO confirm).
        latest_chapter = all_rows[1].find('td').text

        return int(latest_chapter)

    def _extract_images(self, page_content, chapter_no):
        """(dict, int) ---> None

        Requests every image URL of a chapter and passes each response to
        _save_image, which writes the image to the local folder.

        params:
        ---------------
        page_content:
            Dictionary {page index (int, starting at 0): image URL (str)}.

        chapter_no:
            The number of the chapter which should be scraped.
        """
        for page, url in page_content.items():
            # Streamed responses keep their connection open until closed; the
            # context manager releases it once the image has been written.
            with requests.get(url=url, stream=True, headers=self.headers,
                              timeout=self.REQUEST_TIMEOUT) as response:
                OnePieceMangaScraper._save_image(image_ressources=response, chapter_no=chapter_no,
                                                 page_number=page)

    @staticmethod
    def _save_image(image_ressources, chapter_no, page_number):
        """(Response, int, int) ---> (None)

        Saves a given image response in a local folder. The folder
        OnePiece/Chapter_{chapter_no} is created if necessary and the image is
        written to Page_{page_number + 1}.png inside it. Responses with a status
        code other than 200 are ignored.

        params:
        ---------------
        image_ressources:
            Streamed response object (status_code and raw are used) holding the image.

        chapter_no:
            Number of the chapter which should be scraped.

        page_number:
            Zero-based index of the page within the chapter.
        """
        chapter_dir = os.path.join('OnePiece', 'Chapter_' + str(chapter_no))
        # Creates 'OnePiece' and the chapter folder as needed; existing folders are fine.
        os.makedirs(chapter_dir, exist_ok=True)

        if image_ressources.status_code == 200:
            # NOTE(review): the file is always named *.png regardless of the
            # actual image format delivered by the server.
            page_path = os.path.join(chapter_dir, 'Page_' + str(page_number + 1) + '.png')
            with open(page_path, 'wb') as f:
                # Let urllib3 undo gzip/deflate transfer encoding while copying.
                image_ressources.raw.decode_content = True
                shutil.copyfileobj(image_ressources.raw, f)

In [186]:
# Instantiating already performs HTTP requests: the constructor checks that the
# site is reachable and builds the chapter dictionary.
mangascraper = OnePieceMangaScraper()

HTTP-Response Code : <Response [200]>


In [187]:
# Long-running: chapters are downloaded one after another with a one-second
# pause in between (the recorded run below was interrupted manually).
mangascraper.scrape_all_chapters()

 ---- Currently Scraping Chapter 1 of 1018 ---- - Chapter 1 saved. -
 ---- Currently Scraping Chapter 2 of 1018 ---- - Chapter 2 saved. -
 ---- Currently Scraping Chapter 3 of 1018 ---- - Chapter 3 saved. -
 ---- Currently Scraping Chapter 4 of 1018 ---- - Chapter 4 saved. -
 ---- Currently Scraping Chapter 5 of 1018 ---- - Chapter 5 saved. -
 ---- Currently Scraping Chapter 6 of 1018 ---- - Chapter 6 saved. -
 ---- Currently Scraping Chapter 7 of 1018 ---- - Chapter 7 saved. -
 ---- Currently Scraping Chapter 8 of 1018 ---- - Chapter 8 saved. -
 ---- Currently Scraping Chapter 9 of 1018 ---- - Chapter 9 saved. -
 ---- Currently Scraping Chapter 10 of 1018 ---- - Chapter 10 saved. -
 ---- Currently Scraping Chapter 11 of 1018 ---- - Chapter 11 saved. -
 ---- Currently Scraping Chapter 12 of 1018 ---- - Chapter 12 saved. -
 ---- Currently Scraping Chapter 13 of 1018 ---- - Chapter 13 saved. -
 ---- Currently Scraping Chapter 14 of 1018 ---- - Chapter 14 saved. -
 ---- Currently Scraping

KeyboardInterrupt: 