In [54]:
import pandas as pd
import re
import requests
from lxml import html
import sys 

In [55]:
class Scrapper:
    
    def __init__(self, source, link_pattern, xpath, df, max_while = 4, max_links = 10):
        self.source = source
        self.link_pattern = link_pattern
        self.xpath = xpath
        self.df = df
        self.max_while = max_while
        self.max_links = max_links
        self.link_storage = []
        self.link_history = []
    
    def scrape(self, seed_link):
        self.link_storage.append(seed_link)
        print('Put link into seed link storage')
        i = 0
        while True:
            links_list_length = len(self.link_storage)
            print('Entered while cycle')
            links = self.link_storage.copy()
            for link in links:
                print('Entered for cycle')
                if link not in self.link_history:
                    seed_page = self.load(link)
                    print('Tried to load: ' + link)
                    self.link_history.append(link)
                    if seed_page:
                        print('Loaded: ' + link)
                        if self.is_needed(link):
                            print('Link is needed: ' + link)
                            self.extract_info(seed_page)
                            print('Info extracted')
                        self.extract_links(seed_page)
                        print('Links extracted')
                if self.df.shape[0] > self.max_links:
                    break
            if links_list_length == len(self.link_storage):
                i += 1
            if self.df.shape[0] > self.max_links or i > self.max_while:
                break
        #    self.scrape(self.link_storage)
        return self.df
        
    
    def extract_links(self, page):
        pattern = self.source + '[a-z0-9\-\_\.\/]+'
        links = re.findall(pattern, page.text)
        for link in links:
            if not link in self.link_storage:
                self.link_storage.append(link)
    
    def load(self, link):
        response = requests.get(link)            
        return response
    
    def extract_info(self, page):
        for path in self.xpath:
            tree = html.fromstring(page.content.decode('UTF-8'))
            result = tree.xpath(path)   
            
            print('find this: ' + path + ' in', end = ' ')
            print (self.xpath)
            
            self.df = self.df.append(pd.DataFrame(result), ignore_index=True) #нужно создать в цикле новую df из results и добавлять ее к общей df
            
            print (path + ' added')
            #return result
    
    def is_needed(self, link):
        return bool(re.search(self.link_pattern, link))

In [56]:
df = pd.DataFrame(columns=['title','author', 'lead'])          #На сайте клоопа расположение, описание и наличие lead часто менялись

scrapper_kloop = Scrapper('https://kloop.kg', 
                          '/blog/',
                          xpath = ['//header/h1/text()',  '//header/div/div/a/text()', '//h4/strong/text()'],                   #сделать список xpath в [], '//header/div/div/a', '//footer/div/ul
                          df=df)


#xpath = ['title' : '//header/h1/text()', 'author' : '//header/div/div/a',   'tag' :'//footer/div/ul'],    
                                 
                                   

In [57]:
df = scrapper_kloop.scrape ('https://kloop.kg')

Put link into seed link storage
Entered while cycle
Entered for cycle
Tried to load: https://kloop.kg
Loaded: https://kloop.kg
Links extracted
Entered while cycle
Entered for cycle
Entered for cycle
Tried to load: https://kloop.kg/xmlrpc.php
Loaded: https://kloop.kg/xmlrpc.php
Links extracted
Entered for cycle
Tried to load: https://kloop.kg/wp-content/uploads/2014/08/kloop_favicon.png
Loaded: https://kloop.kg/wp-content/uploads/2014/08/kloop_favicon.png
Links extracted
Entered for cycle
Tried to load: https://kloop.kg/
Loaded: https://kloop.kg/
Links extracted
Entered for cycle
Tried to load: https://kloop.kg/feed/
Loaded: https://kloop.kg/feed/
Links extracted
Entered for cycle
Tried to load: https://kloop.kg/comments/feed/
Loaded: https://kloop.kg/comments/feed/
Links extracted
Entered for cycle
Tried to load: https://kloop.kg/wp-content/plugins/chained-quiz/css/main.css
Loaded: https://kloop.kg/wp-content/plugins/chained-quiz/css/main.css
Links extracted
Entered for cycle
Tried to 

Tried to load: https://kloop.kg/wp-content/uploads/2018/06/
Entered for cycle
Tried to load: https://kloop.kg/blog/2018/11/05/tsik-lishil-shadieva-deputatskogo-kresla-po-ego-zhe-prosbe-sam-politik-otritsaet-chto-prosil-ob-etom/
Loaded: https://kloop.kg/blog/2018/11/05/tsik-lishil-shadieva-deputatskogo-kresla-po-ego-zhe-prosbe-sam-politik-otritsaet-chto-prosil-ob-etom/
Link is needed: https://kloop.kg/blog/2018/11/05/tsik-lishil-shadieva-deputatskogo-kresla-po-ego-zhe-prosbe-sam-politik-otritsaet-chto-prosil-ob-etom/
find this: //header/h1/text() in ['//header/h1/text()', '//header/div/div/a/text()', '//h4/strong/text()']
//header/h1/text() added
find this: //header/div/div/a/text() in ['//header/h1/text()', '//header/div/div/a/text()', '//h4/strong/text()']
//header/div/div/a/text() added
find this: //h4/strong/text() in ['//header/h1/text()', '//header/div/div/a/text()', '//h4/strong/text()']
//h4/strong/text() added
Info extracted
Links extracted
Entered for cycle
Tried to load: http

In [58]:
df

Unnamed: 0,title,author,lead,0
0,,,,Записывайтесь! Летняя школа журналистики от Бе...
1,,,,Бектур Искендер
2,,,,ТАДЖИКИСТАН: МИЛЛИОНЫ В БРАЧНОЙ КОРЗИНЕ
3,,,,OCCRP
4,,,,ЦИК лишил Шадиева депутатского кресла по его ж...
5,,,,Рустам Халимов
6,,,,Бывший депутат кыргызского парламента Аскарбек...
7,,,,Цифра дня: Уголовное дело против бывшего кыргы...
8,,,,Айдай Иргебаева
9,,,,Одним кадром: Сооронбай Жээнбеков и чемпион UF...


In [None]:
scrapper_kloop.load('https://kloop.kg/blog/2018/11/01/isa-omurkulov-bolshe-ne-chlen-sdpk-no-ostaetsya-liderom-ee-parlamentskoj-fraktsii-chto-proishodit/')

In [117]:
for scrapper_kloop.path in scrapper_kloop.xpath:
    print(scrapper_kloop.path)

//header/div/div/a/text()
//footer/div/ul/text()


In [15]:
scrapper_kloop.df.shape

(0, 3)

In [20]:
scrapper_kloop.df.columns

Index(['title', 'author', 'tag'], dtype='object')

In [154]:
scrapper_kloop.xpath [0]

'//header/h1/text()'

In [42]:
scrapper_kloop.s2

AttributeError: 'Scrapper' object has no attribute 's2'