In [41]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from IES_Downloader import IES_Downloader
from IPython.core.debugger import Tracer

# Lecture 6 - IES Web scraper

by Vítek Macháček

March 27th and 28th, 2019

* Putting it all together
* OOP + Pandas + Requests + BeautifulSoup

## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

In [44]:
class Parent:
    '''
    A very brief and helpful description of the Parent class
    '''
    def __init__(self,arg):
        '''
        Exact and brief description of parent's constructor
        '''
        self.attribute = 'x'
        self.id = arg
        
    def parentMethod(self):
        '''
        Good description of parentMethod
        '''
        pass
    
class Child(Parent):
    '''
    This is what Child is good for
    '''
    def __init__(self,arg,desc):
        '''
        Exact and brief description of Child's contructor
        '''
        super().__init__(arg)
        self.desc = desc
    
    def childMethod(self,arg):
        '''
        Good description of childMethod
        '''
        return len(arg)

    
par = Parent('A parent')
ch = Child(999,'A child')


<__main__.Child object at 0x000001D42DC954E0>


In [5]:
# super: superior - inherit from parent?
# '' ... docstrings - describe the class - what are the inputs/outputs...
# objects: to store variables together
# init: why objects have init function? it initializes the object
?Parent
?par

## Task:
* A parser of IES websites with following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019
    * Also all courses! But no list of courses available ...

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


In [6]:
requests.get('http://ies.fsv.cuni.cz/robots.txt') # doesnt exist -> can scrape
# IES is a static webpage

<Response [404]>

In [7]:
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=


User-agent: SeznamBot
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=

Sitemap: https://www.sreality.cz/site

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

In [47]:
def getSoup(link):
    r = requests.get(link)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml') #creates BS object from the link - use this object to find ALL tds with class = peopleTableCellaName

# soup = getSoup('http://ies.fsv.cuni.cz/en/node/48')

In [51]:
def getAllLinks(link):
    soup = getSoup(link)
    #Tracer()()
    tds = soup.findAll('td', {'class':'peopleTableCellName'})
    #return tds
    return ['http://ies.fsv.cuni.cz' + td.find('a')['href'] for td in tds] # list comprehension

#list comprehension: [x['...'] for x in list] (do something for x in list)
#dictionary comprehension: {x:x.attribute for x in list}

personLinks = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
personLinks

['http://ies.fsv.cuni.cz/en/staff/barunik',
 'http://ies.fsv.cuni.cz/en/staff/bauerm',
 'http://ies.fsv.cuni.cz/en/staff/baxajaromir',
 'http://ies.fsv.cuni.cz/en/staff/antosova',
 'http://ies.fsv.cuni.cz/en/staff/cahlik',
 'http://ies.fsv.cuni.cz/en/staff/fcech',
 'http://ies.fsv.cuni.cz/en/staff/cervinka',
 'http://ies.fsv.cuni.cz/en/staff/chytilova',
 'http://ies.fsv.cuni.cz/en/staff/dedek',
 'http://ies.fsv.cuni.cz/en/staff/dolezalova',
 'http://ies.fsv.cuni.cz/en/staff/gersl',
 'http://ies.fsv.cuni.cz/en/staff/gregor',
 'http://ies.fsv.cuni.cz/en/staff/havranek',
 'http://ies.fsv.cuni.cz/en/staff/irsova',
 'http://ies.fsv.cuni.cz/en/staff/hlavacekm',
 'http://ies.fsv.cuni.cz/en/staff/hollmannov',
 'http://ies.fsv.cuni.cz/en/staff/holub',
 'http://ies.fsv.cuni.cz/en/staff/horvath',
 'http://ies.fsv.cuni.cz/en/staff/jakubik',
 'http://ies.fsv.cuni.cz/en/staff/janda',
 'http://ies.fsv.cuni.cz/en/staff/jansky',
 'http://ies.fsv.cuni.cz/en/staff/kemenyova',
 'http://ies.fsv.cuni.cz/en/

## Person's characteristics?

In [55]:
# extract full names and titles
def getName(link):
    soup = getSoup(link)
    return soup.find('h2').text

# try function:
getName('http://ies.fsv.cuni.cz/cs/staff/barunik')
[getName(link) for link in personLinks]

['doc. PhDr. Jozef Baruník Ph.D.',
 'doc. PhDr. Michal Bauer Ph.D.',
 'PhDr. Jaromír Baxa Ph.D.',
 'PhDr. Lucie Bryndová ',
 'doc. Ing. Tomáš Cahlík CSc.',
 'PhDr. František Čech ',
 'RNDr. Michal Červinka Ph.D.',
 'doc. PhDr. Julie Chytilová Ph.D.',
 'prof. Ing. Oldřich Dědek CSc.',
 'doc. PhDr. Ing. Antonie Doležalová Ph.D.',
 'doc. PhDr. Adam Geršl Ph.D.',
 'doc. PhDr. Martin Gregor Ph.D.',
 'doc. PhDr. Tomáš Havránek Ph.D.',
 'doc. PhDr. Zuzana Havránková Ph.D.',
 'PhDr. Michal Hlaváček Ph.D.',
 'Ing. Monika Hollmannová ',
 'doc. Mgr. Tomáš Holub Ph.D.',
 'prof. Roman Horváth Ph.D.',
 'doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.',
 'prof. Ing. Karel Janda M.A., Dr., Ph.D.',
 'doc. Petr Janský Ph.D.',
 'Ing. Irena Kemény ',
 'prof. Ing. Evžen Kočenda M.A., Ph.D., DSc.',
 'prof. Ing. et Ing. Luboš Komárek Ph.D., MSc., MBA',
 'Mgr. Vědunka Kopečná ',
 'doc. PhDr. Ladislav Krištoufek Ph.D.',
 'PhDr. Jiří Kukačka Ph.D.',
 'prof. Ing. Michal Mejstřík CSc.',
 'Mgr. Ing. Matěj Nevrla ',


In [32]:
def getID(link): 
    #http://ies.fsv.cuni.cz/cs/staff/barunik # extract just the last name
    return link.split('/')[-1] 

[getID(link) for link in personLinks]

['barunik',
 'bauerm',
 'baxajaromir',
 'antosova',
 'cahlik',
 'fcech',
 'cervinka',
 'chytilova',
 'dedek',
 'dolezalova',
 'gersl',
 'gregor',
 'havranek',
 'irsova',
 'hlavacekm',
 'hollmannov',
 'holub',
 'horvath',
 'jakubik',
 'janda',
 'jansky',
 'kemenyova',
 'kocenda',
 'komarek',
 'kopecna',
 'kristoufek',
 'kukacka',
 'mejstrik',
 'nevrla',
 'novakji',
 'malirova',
 'paulus',
 'neprasova',
 'gebicka',
 'pinter',
 'scasny',
 'schneider',
 'schwarz',
 'semerak',
 'reckova',
 'gregorovalenka',
 'reichlova',
 'teply',
 'turnovec',
 'vacek',
 'vacha',
 'visek',
 'vosvrda',
 'prochazkova',
 'zacek']

In [92]:
def getOneCharacteristic(link, characteristic):
    soup = getSoup(link)
    strong = soup.find('strong', text = characteristic)
    return strong.next_sibling #strong.next_sibling.text OR strong.next_sibling

getOneCharacteristic('http://ies.fsv.cuni.cz/en/staff/barunik', 'Field of interest:')

#[getOneCharacteristic(link, 'Field of interest:') for link in personLinks]
#[getOneCharacteristic(link, 'Phone:') for link in personLinks]

In [102]:
def getOneCharacteristic(link, characteristic):
    soup = getSoup(link)
    strong = soup.find('strong', text = characteristic)
    return strong.next_sibling #strong.next_sibling.text OR strong.next_sibling

def GetMoreChar(link, characteristics):
    return [GetOneCharacteristic(link, char) for char in characteristics]

getOneCharacteristic('http://ies.fsv.cuni.cz/en/staff/barunik', 'Field of interest:')
#GetMoreChar('http://ies.fsv.cuni.cz/en/staff/barunik', ['Phone:', 'Office:'])
#[GetMoreChar(personLinks, ['Phone:', 'Office:']) for link in personLinks]

AttributeError: 'NavigableString' object has no attribute 'get_text'

In [97]:
def getAnotherCharacteristic(link,characteristic):
    return [GetOneCharacteristic(link, char) for char in characteristic]

#getOneCharacteristic('http://ies.fsv.cuni.cz/en/staff/barunik', 'Field of interest:')
#getAnotherCharacteristic('http://ies.fsv.cuni.cz/en/staff/barunik', ['Phone:', 'Office:'])
# NEVIM! :(

* Let's do an object!

In [40]:
# pevious process took too long bc. we called getSoup twice actually -> make class, use self.---
class Person:
    def __init__(self,link):
        self.soup = getSoup(link)
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')
        
    def getNextSiblingOfStrong(link, characteristic):
        strong = self.soup.find('strong', text = characteristic)
        return strong.next_sibling

people = [Person(link) for link in personLinks]
[p.phone for p in people]

NameError: name 'self' is not defined

## Thesis characteristics?

In [None]:
dl = IES_Downloader(allowLog=False)
dl.getThesesLinksForCategory('http://ies.fsv.cuni.cz/en/node/270/','Doctoral')
thesesLinks = dl.links['theses']['Doctoral']

In [None]:
class Thesis:
    def __init__(self,link):
        pass

theses = []
for t in tqdm(thesesLinks):
    theses.append(Thesis(t))

* We are lazy programmers!

In [None]:
class Parent:
    pass

class Thesis(Parent):
    pass

class Person(Parent):
    pass

See **IES_Pages.py**

### Last object missing

see **IES_Downloader.py**

# Object Structure