In [1]:
%run common.ipynb

In [48]:
import matplotlib.pyplot as plt
plt.ion()

In [2]:
import urllib
import urllib.parse
import xml.etree.ElementTree as ET

In [3]:
import sqlite3
import collections


# Build the lookup {OKPDTR code -> set of program codes} from the local SQLite file.
# The LEFT JOIN also returns programs that have no OKPDTR link; for those rows the
# okpdtr column is NULL, so they are collected under the key None.
con = sqlite3.connect('programs.db')
okpdtr_to_progs = collections.defaultdict(set)
try:
    for okpdtr, program in tqdm.tqdm(con.execute('select okpdtr.okpdtr, p.program_code from programs p left join program_okpdtr okpdtr on p.program_code=okpdtr.program')):
        okpdtr_to_progs[okpdtr].add(program)
finally:
    # Release the database handle even when the query or the loop fails.
    con.close()

4817it [00:00, 401581.41it/s]


In [4]:
import pandas as pd

# Load the program table and keep an ordered list of its distinct program codes.
programs = pd.read_csv('programs.csv')
program_ids = list(programs['program_code'].unique())
program_ids.sort()

In [5]:
# Download the region directory and turn it into a {region code: region name} dict.
regions_response = requests.get('https://opendata.trudvsem.ru/7710538364-regions/regions.xml',
                                timeout=60)  # do not hang forever on a stalled connection
# Fail loudly on an HTTP error instead of trying to parse an error page as XML.
regions_response.raise_for_status()
regions_root = ET.fromstring(regions_response.content)
regions = {
    code.text: name.text
    for (code, name) in ((r.find('code'), r.find('name'))
                         for r in regions_root.findall('region'))
    # Compare against None explicitly: childless XML elements are falsy.
    if code is not None and name is not None
}
regions

{'7200000000000': 'Тюменская область',
 '6800000000000': 'Тамбовская область',
 '3300000000000': 'Владимирская область',
 '9200000000000': 'г. Севастополь',
 '5900000000000': 'Пермский край',
 '8900000000000': 'Ямало-Ненецкий автономный округ',
 '5800000000000': 'Пензенская область',
 '1100000000000': 'Республика Коми',
 '2600000000000': 'Ставропольский край',
 '1200000000000': 'Республика Марий Эл',
 '7900000000000': 'Еврейская автономная область',
 '0900000000000': 'Карачаево-Черкесская Республика',
 '3200000000000': 'Брянская область',
 '0300000000000': 'Республика Бурятия',
 '9900000000000': 'г. Байконур',
 '2900000000000': 'Архангельская область',
 '4100000000000': 'Камчатский край',
 '7800000000000': 'г. Санкт-Петербург',
 '2100000000000': 'Чувашская Республика',
 '7600000000000': 'Ярославская область',
 '7100000000000': 'Тульская область',
 '4800000000000': 'Липецкая область',
 '5500000000000': 'Омская область',
 '4200000000000': 'Кемеровская область',
 '4600000000000': 'Курская

In [6]:
def count_iterations_online(iterator, name=None):
    """Yield the items of ``iterator`` unchanged while printing a running counter.

    Before each item is handed to the consumer, its zero-based position is
    printed right-aligned in a 10-character field, prefixed with ``"<name>: "``
    when ``name`` is truthy.

    Args:
        iterator: Any iterable; it is consumed lazily.
        name: Optional label for the counter. It is inserted as data, so it may
            contain ``{`` or ``}`` without breaking the format string.

    Yields:
        The items of ``iterator`` in their original order.
    """
    # Keep the label out of the format template: concatenating it into the
    # template made str.format() choke on labels that contain braces.
    prefix = '{}: '.format(name) if name else ''
    for i, x in enumerate(iterator):
        print(prefix + '{:10}'.format(i))
        yield x

In [7]:
def get_vacancies(path='trudvsem/jobs.xml'):
    """Stream ``<vacancy>`` elements from an XML dump without keeping the tree in memory.

    The file is parsed incrementally. Each ``<vacancy>`` element is yielded as
    soon as its closing tag has been read. When the consumer asks for the next
    item, the yielded element and the enclosing ``<vacancies>`` container (if
    one was seen) are cleared, so anything needed from an element must be
    extracted before advancing the generator.

    Args:
        path: Location of the XML file. Defaults to the dump used by this
            notebook.

    Yields:
        ``xml.etree.ElementTree.Element`` objects with tag ``vacancy``.
    """
    root = None
    for evt, elem in ET.iterparse(path, events=('start', 'end')):
        if elem.tag == 'vacancies':
            # Remember the container so already-processed children can be dropped.
            root = elem
        elif evt == 'end' and elem.tag == 'vacancy':
            yield elem
            elem.clear()
            # A file without a <vacancies> wrapper has no container to prune.
            if root is not None:
                root.clear()

@attr.s
class XmlVacancy:
    """A vacancy XML element paired with the OKPDTR codes found on it."""
    # The <vacancy> element as yielded by the XML parser.
    xml = attr.ib()
    # OKPDTR codes of the vacancy; get_linkable_vacancies() stores a list of strings here.
    okpdtrs = attr.ib()

def url_to_okpdtr(url):
    """Return the URL fragment of an element's ``resource`` attribute.

    ``url`` is an XML element (not a string); the text after ``#`` in its
    ``resource`` attribute is returned, or ``''`` when there is no fragment.
    """
    resource = url.attrib['resource']
    parsed = urllib.parse.urlparse(resource)
    return parsed.fragment

def get_linkable_vacancies():
    """Lazily yield the vacancies that can be linked to at least one program.

    For every vacancy produced by ``get_vacancies()`` the OKPDTR codes are read
    from its ``<profession>`` children, and only the codes present in
    ``okpdtr_to_progs`` are kept. Vacancies without professions, or without any
    known code, are skipped.

    Yields:
        ``XmlVacancy(xml, okpdtrs)`` where ``okpdtrs`` is a non-empty list of
        known codes. The XML element is only valid until the next item is
        requested, because ``get_vacancies()`` clears elements as it advances.

    To trace progress, wrap the returned iterator in
    ``count_iterations_online(...)``.
    """
    # The vacancy stream is opened exactly once; an extra, never-consumed
    # get_vacancies() generator used to be created here.
    for vacancy in get_vacancies():
        professions = vacancy.findall('profession')
        if not professions:
            continue
        okpdtrs = [code
                   for code in map(url_to_okpdtr, professions)
                   if code in okpdtr_to_progs]
        if okpdtrs:
            yield XmlVacancy(vacancy, okpdtrs)
    
# Smoke check: print the first two linkable vacancies.
profs = itertools.islice(get_linkable_vacancies(), 2)
for p in profs:
    print(p)

XmlVacancy(xml=<Element 'vacancy' at 0x7fb2608b6368>, okpdtrs=['271421'])
XmlVacancy(xml=<Element 'vacancy' at 0x7fb260734278>, okpdtrs=['272443'])


In [8]:
%%time


@attr.s
class RegionVacancy:
    """Vacancy reduced to its title, its regions and its OKPDTR codes.

    ``get_vacancies_regionwise`` uses it in two stages: first with raw XML
    elements in ``title``/``regions``, then with the title text and the
    region names.
    """
    title = attr.ib()
    regions = attr.ib()
    okpdtrs = attr.ib()

    def valid(self):
        """Return True when a title is present and both lists are non-empty."""
        if self.title is None:
            return False
        for values in (self.regions, self.okpdtrs):
            if values is None or len(values) == 0:
                return False
        return True
    
    
def get_vacancies_regionwise():
    """Lazily yield linkable vacancies as ``RegionVacancy`` records.

    Each linkable vacancy is first wrapped with its ``<title>`` element and
    ``<region>`` elements. Records that fail ``valid()`` are dropped. For the
    rest, the title text is taken and every region reference is translated to
    a region name through the module-level ``regions`` mapping (a code missing
    from that mapping raises ``KeyError``).
    """
    for linkable in get_linkable_vacancies():
        raw = RegionVacancy(linkable.xml.find('title'),
                            linkable.xml.findall('region'),
                            linkable.okpdtrs)
        if not raw.valid():
            continue
        region_names = []
        for region_elt in raw.regions:
            if region_elt is None:
                continue
            code = urllib.parse.urlparse(region_elt.attrib['resource']).fragment
            region_names.append(regions[code])
        yield RegionVacancy(raw.title.text, region_names, raw.okpdtrs)


def demanded_programs_regionwise():
    """Yield one ``(program, region)`` pair per program/region combination of every vacancy.

    For each vacancy, the programs linked to each of its OKPDTR codes are
    visited in order and paired with every region of the vacancy. A program
    reachable through several codes of the same vacancy is yielded once per
    code.
    """
    # Plain nested loops keep everything lazy; nothing is materialised per vacancy.
    for vacancy in get_vacancies_regionwise():
        for okpdtr in vacancy.okpdtrs:
            for program in okpdtr_to_progs[okpdtr]:
                for region_name in vacancy.regions:
                    yield (program, region_name)
                
                
#print(list(itertools.islice(demanded_programs_regionwise(), 1)))
#print(list(itertools.islice(get_vacancies_regionwise(), 1)))

# Tally how often each (program, region) pair is produced across all vacancies.
prog_to_cnt = collections.Counter()
prog_to_cnt.update(demanded_programs_regionwise())

CPU times: user 1min 10s, sys: 418 ms, total: 1min 10s
Wall time: 1min 10s


In [9]:
# Persist the tallies in long format: one row per (program, region) with its count.
pd.DataFrame(
    [(prog, region, cnt) for (prog, region), cnt in prog_to_cnt.items()],
    columns=['program', 'region', 'count'],
).set_index(['program', 'region']).to_csv('demand.csv')

In [10]:
# Read the two-column mapping file into a dict (first column -> second column)
# and also build the reverse lookup.
program_codes = pd.read_csv('programcode_to_id.csv')
program_codes = dict(program_codes.itertuples(index=False, name=None))
program_codes_inv = {program_id: code for (code, program_id) in program_codes.items()}
program_codes

{'01.03.01': '01.04.01',
 '01.03.02': '01.04.02',
 '01.03.03': '01.04.03',
 '01.03.04': '01.04.04',
 '01.04.01': '01.04.01',
 '01.04.02': '01.04.02',
 '01.04.03': '01.04.03',
 '01.04.04': '01.04.04',
 '01.05.01': '01.05.01',
 '02.03.01': '02.04.01',
 '02.03.02': '02.04.02',
 '02.03.03': '02.04.03',
 '02.04.01': '02.04.01',
 '02.04.02': '02.04.02',
 '02.04.03': '02.04.03',
 '03.03.01': '03.04.01',
 '03.03.02': '03.04.02',
 '03.03.03': '03.04.03',
 '03.04.01': '03.04.01',
 '03.04.02': '03.04.02',
 '03.04.03': '03.04.03',
 '03.05.01': '03.05.01',
 '04.03.01': '04.04.01',
 '04.03.02': '04.04.02',
 '04.04.01': '04.04.01',
 '04.04.02': '04.04.02',
 '04.05.01': '04.05.01',
 '05.03.01': '05.04.01',
 '05.03.02': '05.04.02',
 '05.03.03': '05.04.03',
 '05.03.04': '05.04.04',
 '05.03.05': '05.04.05',
 '05.03.06': '05.04.06',
 '05.04.01': '05.04.01',
 '05.04.02': '05.04.02',
 '05.04.03': '05.04.03',
 '05.04.04': '05.04.04',
 '05.04.05': '05.04.05',
 '05.04.06': '05.04.06',
 '06.03.01': '06.04.01',


In [20]:
# Reshape the (program, region) tallies into {region: {program: count}}.
# Every region starts with a 0 for each known program code, so all rows of the
# wide table share the same set of columns.
demand = collections.defaultdict(
    lambda: collections.defaultdict(lambda: 0, dict.fromkeys(program_codes, 0)))

for (program, region), cnt in prog_to_cnt.items():
    demand[region][program] = cnt

# One row per region, one column per program (columns sorted by program code).
demand_wide = pd.DataFrame.from_dict(demand, orient='index').sort_index(axis=1)
demand_wide.to_csv('demand_wide.csv')
demand_wide.head()

Unnamed: 0,01.03.01,01.03.02,01.03.03,01.03.04,01.04.01,01.04.02,01.04.03,01.04.04,01.05.01,02.03.01,...,54.05.03,54.05.04,54.05.05,55.05.01,55.05.02,55.05.03,55.05.04,55.05.05,58.03.01,58.04.01
Алтайский край,159,212,159,159,0,0,0,0,0,212,...,0,0,0,0,0,0,0,0,0,0
Амурская область,177,198,177,177,0,0,0,0,0,198,...,0,0,0,0,0,0,0,0,0,0
Архангельская область,141,174,141,141,0,0,0,0,0,174,...,0,0,0,0,0,0,0,0,0,0
Астраханская область,32,46,32,32,0,0,0,0,0,46,...,0,0,0,0,0,0,0,0,0,0
Белгородская область,66,87,66,66,0,0,0,0,0,87,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Regions with a non-zero demand for program 01.03.02.
demand_wide[demand_wide['01.03.02'].ne(0)]

Unnamed: 0,01.03.01,01.03.02,01.03.03,01.03.04,01.04.01,01.04.02,01.04.03,01.04.04,01.05.01,02.03.01,...,54.05.03,54.05.04,54.05.05,55.05.01,55.05.02,55.05.03,55.05.04,55.05.05,58.03.01,58.04.01
Алтайский край,159,212,159,159,0,0,0,0,0,212,...,0,0,0,0,0,0,0,0,0,0
Амурская область,177,198,177,177,0,0,0,0,0,198,...,0,0,0,0,0,0,0,0,0,0
Архангельская область,141,174,141,141,0,0,0,0,0,174,...,0,0,0,0,0,0,0,0,0,0
Астраханская область,32,46,32,32,0,0,0,0,0,46,...,0,0,0,0,0,0,0,0,0,0
Белгородская область,66,87,66,66,0,0,0,0,0,87,...,0,0,0,0,0,0,0,0,0,0
Брянская область,25,42,25,25,0,0,0,0,0,42,...,0,0,0,0,0,0,0,0,0,0
Владимирская область,93,121,93,93,0,0,0,0,0,121,...,0,0,0,0,0,0,0,0,0,0
Волгоградская область,413,489,413,413,0,0,0,0,0,489,...,0,0,0,0,0,0,0,0,0,0
Вологодская область,76,118,76,76,0,0,0,0,0,118,...,0,0,0,0,0,0,0,0,0,0
Воронежская область,211,266,211,211,0,0,0,0,0,266,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Total graduates per (region, program); the summed column is exposed as 'count'.
graduates = (
    pd.read_csv(GRADUATES_FILE)
    .loc[:, ['region', 'program_code', 'n_graduates']]
    .groupby(['region', 'program_code'])
    .sum()
    .rename(columns={'n_graduates': 'count'})
)
graduates

Unnamed: 0_level_0,Unnamed: 1_level_0,count
region,program_code,Unnamed: 2_level_1
Алтайский край,01.03.02,19.0
Алтайский край,01.03.04,8.0
Алтайский край,01.04.02,14.0
Алтайский край,02.03.01,9.0
Алтайский край,02.03.02,8.0
Алтайский край,02.04.01,12.0
Алтайский край,03.03.02,13.0
Алтайский край,03.03.03,12.0
Алтайский край,03.04.02,9.0
Алтайский край,03.04.03,6.0


In [14]:
# Save the per-(region, program) graduate totals as the supply table.
graduates.to_csv('supply.csv')

In [15]:
# Peek at the first four (index, row) pairs without materialising every row.
list(itertools.islice(graduates.iterrows(), 4))

[(('Алтайский край', '01.03.02'), count    19.0
  Name: (Алтайский край, 01.03.02), dtype: float64),
 (('Алтайский край', '01.03.04'), count    8.0
  Name: (Алтайский край, 01.03.04), dtype: float64),
 (('Алтайский край', '01.04.02'), count    14.0
  Name: (Алтайский край, 01.04.02), dtype: float64),
 (('Алтайский край', '02.03.01'), count    9.0
  Name: (Алтайский край, 02.03.01), dtype: float64)]

In [17]:
# Flatten the grouped frame into (region, program, count) triples.
supply = [(region, program, row['count'])
          for ((region, program), row) in graduates.iterrows()]

In [23]:
# Inspect one region's demand. NOTE: `demand` is a defaultdict, so indexing a
# region that is absent creates a zero-filled entry as a side effect.
demand['Тюменская область']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'01.03.01': 173,
             '01.03.02': 208,
             '01.03.03': 173,
             '01.03.04': 173,
             '01.04.01': 0,
             '01.04.02': 0,
             '01.04.03': 0,
             '01.04.04': 0,
             '01.05.01': 0,
             '02.03.01': 208,
             '02.03.02': 208,
             '02.03.03': 173,
             '02.04.01': 0,
             '02.04.02': 0,
             '02.04.03': 0,
             '03.03.01': 173,
             '03.03.02': 173,
             '03.03.03': 46,
             '03.04.01': 0,
             '03.04.02': 0,
             '03.04.03': 0,
             '03.05.01': 0,
             '04.03.01': 173,
             '04.03.02': 173,
             '04.04.01': 0,
             '04.04.02': 0,
             '04.05.01': 0,
             '05.03.01': 0,
             '05.03.02': 173,
             '05.03.03': 45,
             '05.03.04': 173,
             '05.03.05': 2,
             '

In [71]:
import bokeh
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import layout
from bokeh.models.widgets import Tabs, Panel

output_notebook()
x_range = bokeh.models.ranges.FactorRange(*sorted(program_codes.keys()), factor_padding=10.0)

# Pick the first region that actually has recorded demand. The membership test
# comes first so that the lookup never inserts a zero-filled entry into the
# `demand` defaultdict; a length check would always pass because every entry is
# pre-filled with all program codes. If no region qualifies, fall back to the
# first region in `regions`.
region_code0, region0 = next(
    ((code, name) for (code, name) in regions.items()
     if name in demand and any(demand[name].values())),
    next(iter(regions.items())))
demand0 = demand[region0]
demand0 = collections.OrderedDict(demand0)
# Supply triples are (region, program, count); keep the ones for this region.
supply0 = ((program, count)
           for (region, program, count)
           in supply
           if region == region0)
supply0 = collections.OrderedDict(supply0)
# Side-by-side table for ad-hoc inspection; the plot below does not read it.
comparison = pd.DataFrame({'demand': pd.Series(demand0), 'supply': pd.Series(supply0)})


p = figure(x_range=x_range, plot_width=800,
           tools='box_zoom, xwheel_pan',
           active_drag='box_zoom',
           active_scroll='xwheel_pan')
# Demand and supply are drawn as overlapping, semi-transparent bars per program.
p.vbar(x=list(demand0.keys()), top=list(demand0.values()),
       width=5.0, legend='demand', fill_alpha=.75, fill_color='red', line_alpha=.5)
p.vbar(x=list(supply0.keys()), top=list(supply0.values()),
       width=5.0, legend='supply', fill_alpha=.5, fill_color='blue', line_alpha=.5)
show(p)

print(region_code0, region0)

7200000000000 Тюменская область


In [74]:
import bokeh
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import layout
from bokeh.models.widgets import Tabs, Panel

# Route Bokeh output to the notebook and build the categorical x axis (one factor
# per program code) that plot_region() reuses for every figure.
output_notebook()
x_range = bokeh.models.ranges.FactorRange(*sorted(program_codes), factor_padding=10.0)

def plot_region(region0):
    """Build a bar chart comparing demand and supply per program for one region.

    Args:
        region0: Region name, used as the key into ``demand`` and matched
            against the first field of the ``supply`` triples.

    Returns:
        A figure with two overlaid ``vbar`` glyphs: demand in red and supply in
        blue, on the module-level ``x_range``.

    Note:
        ``demand`` is indexed directly, so when it is a defaultdict an unknown
        region gets an entry created for it as a side effect.
    """
    demand0 = collections.OrderedDict(demand[region0])
    supply0 = collections.OrderedDict(
        (program, count)
        for (region, program, count) in supply
        if region == region0)

    # No intermediate comparison DataFrame is built: the figure is drawn
    # straight from the two mappings.
    p = figure(x_range=x_range, plot_width=800,
               tools='box_zoom, xwheel_pan',
               active_drag='box_zoom',
               active_scroll='xwheel_pan')
    p.vbar(x=list(demand0.keys()), top=list(demand0.values()),
           width=5.0, legend='demand', fill_alpha=.75, fill_color='red', line_alpha=.5)
    p.vbar(x=list(supply0.keys()), top=list(supply0.values()),
           width=5.0, legend='supply', fill_alpha=.5, fill_color='blue', line_alpha=.5)
    return p

def plot_regions():
    """Return a ``Tabs`` widget with one ``plot_region`` panel per region name."""
    panels = [Panel(child=plot_region(region_name), title=region_name)
              for region_name in regions.values()]
    return Tabs(tabs=panels)

# Render the tabbed per-region demand/supply comparison in the notebook.
show(plot_regions())