In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import requests
import contextlib
import functools
import itertools
import collections
import pprint
import glob

plt.ion()

In [6]:
# Path of the workbook used as the running example in the cells below.
example_xls = 'Своды ВПО-1 2018/Государственные/Воронежская область_ГОС_очная.xls'

In [7]:
import xlrd

In [8]:
# Open the example workbook with xlrd; `ex` is reused by the following cells.
ex = xlrd.open_workbook(example_xls)

In [9]:
# Show the names of all sheets in the example workbook.
ex.sheet_names()

['Р1_1',
 'P1_2(1)',
 'P1_2(2)',
 'Р2_1_1',
 'Р2_1_2(1)',
 'Р2_1_2 (2)',
 'Р2_1_2 (3)',
 'Р2_1_2 (4)',
 'Р2_1_3(1)',
 'Р2_1_3(2)',
 'Р2_1_4',
 'Р2_1_5 (1)',
 'Р2_1_5 (2)',
 'Р2_1_6',
 'Р2_2',
 'Р2_3',
 'Р2_4',
 'Р2_5',
 'Р2_6',
 'Р2_7',
 'Р2_8',
 'Р2_9',
 'Р2_10',
 'Р2_11(1)',
 'P2_11(2)',
 'Р2_12(1)',
 'Р2_12(2)',
 'Р2_12(3)',
 'Р2_13',
 'Р3_1',
 'P3_2',
 'Р3_3_1',
 'P3_3_2',
 'P3_4',
 'P3_5',
 'P3_6',
 'P3_7_1',
 'P3_7_2']

In [10]:
# Select one sheet by name for closer inspection in the cells below.
grad_sheet = ex.sheet_by_name('Р2_1_3(1)')

In [11]:
# Preview the first 50 rows of the sheet as lists of raw xlrd cells.
list(itertools.islice(grad_sheet.get_rows(), 50))

[[text:'Воронежская область',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:''],
 [text:'Государственные, Муниципальные',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:''],
 [text:'очная',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:''],
 [text:'2.1.3. Распределение выпуска бакалавров, специалистов, магистров по направлениям подготовки и специальностям',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:'',
  empty:''],
 [text:'Коды по ОКЕИ: 

In [17]:
# Take the first sheet of the workbook and display its name.
sheet = ex.sheets()[0]
sheet.name

'Р1_1'

In [49]:
# Inspect a numeric cell: the cell at column index 1 of the row at index 50.
# The three prints show the cell's repr, its Python value and its xlrd type code.
number_example = next(itertools.islice(grad_sheet.get_rows(), 50, 51))[1]
print(number_example)
print(number_example.value)
print(number_example.ctype)

number:1.0
1.0
2


In [51]:
# Inspect an empty cell: the last cell of the first row of the sheet.
# The three prints show the cell's repr, the repr of its value and its xlrd type code.
empty_example = next(r for r in grad_sheet.get_rows())[-1]
print(empty_example)
print(repr(empty_example.value))
print(empty_example.ctype)

empty:''
''
0


So, one task for tomorrow is to write a VPO-1 reader:

- [ ] `VPO1Reader`, handles a single `.xls` file.
    Possible solution:
    - [x] Table's header
        - [x] Get the number of columns
        - [x] Find the separator between the header and the numbers
        - [x] For each row above the separator, enumerate its cells, accumulating the name of each "atomic column"
    - [x] Data: traverse the rows below the separator, appending each value to the vector of its "atomic column"
- [x] `VPO1Set`, handles the folder with region-wise reports
- [ ] Download function

In [168]:
import xlrd
import re
import typing


# STARTSWITH_UNITS = 'коды по океи: '
            
class VPO1ParsingError(Exception):
    """Raised when a sheet or workbook cannot be parsed as a VPO-1 report."""

class VPO1Row:
    """One sheet row in which a ``None`` cell continues the cell to its left.

    Every position of the row is an "atomic cell".  A position holding ``None``
    is redirected to the nearest non-``None`` position on its left (position 0
    always refers to itself), so indexing with an atomic position reads or
    writes the cell that "covers" it.
    NOTE(review): presumably this models horizontally merged caption cells,
    which arrive as empty cells after the first one -- confirm against xlrd.

    Attributes:
        cells: private copy of the values passed in, as a list.
        atomic_to_full: for each atomic position, the index in ``cells`` of
            the cell that covers it.

    An empty row is accepted and yields ``n_cells == n_atomic_cells == 0``.
    """

    def __init__(self, cells):
        # consider, for `str`s:
        # cells = map(str.strip, cells)
        # cells = map(str.lower, cells)
        # plus multiple spaces, etc
        cells = list(cells)
        self.cells = cells
        atomic_to_full = []
        n_cells = 0
        for i, cell in enumerate(cells):
            if i == 0 or cell is not None:
                # Position 0 owns itself even when it is empty, so there is
                # always something on the left to fall back to.
                atomic_to_full.append(i)
                n_cells += 1
            else:
                # An empty position inherits the owner of its left neighbour.
                atomic_to_full.append(atomic_to_full[-1])
        self.atomic_to_full = atomic_to_full
        self._n_cells = n_cells

    @property
    def n_cells(self):
        """Number of positions that own themselves (position 0 plus every non-``None`` cell)."""
        return self._n_cells

    @property
    def n_atomic_cells(self):
        """Total number of positions in the row."""
        return len(self.cells)

    def clone(self):
        """Return a new row built from a copy of this row's cell list."""
        return VPO1Row(self.cells)

    def __getitem__(self, atomic_cell_id):
        cell_id = self.atomic_to_full[atomic_cell_id]
        return self.cells[cell_id]

    def __setitem__(self, atomic_cell_id, value):
        # Writes go to the covering cell, not to the atomic position itself.
        cell_id = self.atomic_to_full[atomic_cell_id]
        self.cells[cell_id] = value

    def __repr__(self):
        return repr(self.cells)
    
class VPO1Header:
    """Traits and column captions taken from the rows above a sheet's data.

    The first ``len(DESC)`` rows are read as single-valued traits: the first
    cell of each of those rows is stored as an attribute named after the
    corresponding ``DESC`` key.  The rows after them are flattened into one
    caption string per column, stored in ``table_header``.

    Args:
        rows: the rows that precede the data part of the sheet.
        n_cols: number of columns to build captions for.

    Raises:
        VPO1ParsingError: if there are fewer rows than traits, or the first
            cell of the units row is not a string containing 'ОКЕИ'.
    """
    # Trait names in the order of their rows; rebound below to {name: row index}.
    DESC = ['region', 'is_governmental', 'fulltime', 'section_name', 'units']
    DESC = dict(((x, i) for i, x in enumerate(DESC)))

    def __init__(self, rows: typing.List[VPO1Row], n_cols: int):
        if len(rows) < len(VPO1Header.DESC):
            # Arguments are (expected, actual), matching the order of the placeholders.
            raise VPO1ParsingError(
                'Expected at least {} rows in the header (for the traits) got {}'
                .format(len(VPO1Header.DESC), len(rows))
            )
        units = rows[VPO1Header.DESC['units']][0]
        if not isinstance(units, str) or 'ОКЕИ' not in units:
            raise VPO1ParsingError('Units of measurement misspecified')

        # Per-trait clean-up hooks; every trait currently uses the identity.
        TRANSFORMS = collections.defaultdict(lambda: (lambda x: x))
        # TRANSFORMS['units'] = lambda x: x.replace(STARTSWITH_UNITS, '')
        for field, i in VPO1Header.DESC.items():
            transform = TRANSFORMS[field]
            setattr(self, field, transform(rows[i][0]))

        def table_header_transform(x):
            # Empty caption cells contribute an empty string to the joined name.
            if x is None:
                return ''
            return str(x)

        # The row right after the trait rows is skipped.
        # NOTE(review): presumably it carries no column caption -- confirm on a real sheet.
        table_header_start = len(VPO1Header.DESC) + 1
        caption_rows = rows[table_header_start:]
        # One caption per column: the column's cells from every caption row,
        # top to bottom, joined with single spaces.
        self.table_header = [
            ' '.join(table_header_transform(row[i]) for row in caption_rows)
            for i in range(n_cols)
        ]
        self.n_cols = n_cols
        
def vpo1_table(header: VPO1Header,
               rows: typing.List[VPO1Row]) -> pd.DataFrame:
    """Assemble the data rows of a sheet into a DataFrame.

    The first cell of every row becomes the index label; the cells at
    positions ``1 .. header.n_cols - 1`` become the values, under the captions
    ``header.table_header[1:]``.
    """
    value_positions = range(1, header.n_cols)
    return pd.DataFrame(
        data=[[row[j] for j in value_positions] for row in rows],
        index=[row[0] for row in rows],
        columns=header.table_header[1:],
    )

class VPO1Page:
    """A single parsed sheet: its header traits plus the data table.

    The sheet is split at the first row whose first cell is the float ``1``
    (the row that numbers the columns): rows above it form the header, rows
    below it form the table.  The header traits are mirrored as attributes of
    the page.

    Raises:
        VPO1ParsingError: if no such row exists, or the header is rejected.
    """

    def __init__(self, rows: typing.List[VPO1Row]):
        for position, row in enumerate(rows):
            first = row[0]
            if isinstance(first, float) and first == 1:
                idx_col_numbers = position
                break
        else:
            raise VPO1ParsingError('Invalid sheet. You probably should just skip it.')

        # Widest row, counted in self-owning cells, decides the column count.
        n_cols = max(r.n_cells for r in rows)
        self.header = VPO1Header(rows[:idx_col_numbers], n_cols)
        self.table = vpo1_table(self.header, rows[idx_col_numbers + 1:])
        for trait in VPO1Header.DESC:
            setattr(self, trait, getattr(self.header, trait))

    @staticmethod
    def try_yield_parsed(rows):
        """Yield the parsed page, or nothing at all if the sheet cannot be parsed."""
        try:
            yield VPO1Page(rows)
        except VPO1ParsingError:
            pass

    
class VPO1:
    """All parseable sheets of one workbook.

    ``names`` and ``pages`` are parallel lists holding, for every sheet that
    parsed successfully, its name and its ``VPO1Page``.  The header traits of
    the first parsed page are mirrored as attributes of the workbook object.

    Raises:
        VPO1ParsingError: if not a single sheet could be parsed.
    """

    def __init__(self, sheets: typing.Dict[str, typing.List[VPO1Row]]):
        names, pages = [], []
        for sheet_name, sheet_rows in sheets.items():
            # Yields one page on success and nothing for an unparseable sheet.
            for page in VPO1Page.try_yield_parsed(sheet_rows):
                names.append(sheet_name)
                pages.append(page)
        self.names, self.pages = names, pages
        if not self.pages:
            raise VPO1ParsingError('No pages could be parsed')
        first_page = self.pages[0]
        for trait in VPO1Header.DESC:
            setattr(self, trait, getattr(first_page, trait))

    @staticmethod
    def try_yield_parsed(sheets):
        """Yield the parsed workbook, or nothing at all if it cannot be parsed."""
        try:
            yield VPO1(sheets)
        except VPO1ParsingError:
            pass
def unpack_cell(cell):
    """Return the cell's value, or ``None`` when its type code is 0 (an empty cell)."""
    # TODO: apply normalization, stripping str's, etc -- right here
    if cell.ctype == 0:
        return None
    return cell.value

def unpack_row(row):
    """Lazily apply `unpack_cell` to every cell of `row`; the result is a one-shot `map` iterator."""
    return map(unpack_cell, row)

def read_vpo1(filename):
    """Open the workbook at `filename` with xlrd and parse it into a `VPO1`.

    Every sheet is converted to a list of `VPO1Row` objects, keyed by the sheet
    name; `VPO1` then decides which sheets are parseable.
    """
    workbook = xlrd.open_workbook(filename)
    sheets = {}
    for sheet in workbook.sheets():
        sheets[sheet.name] = [VPO1Row(unpack_row(raw_row))
                              for raw_row in sheet.get_rows()]
    return VPO1(sheets)


class VPO1Set:
    """Index of the VPO-1 workbooks found under a directory, searchable by trait.

    Attributes:
        files: list of workbook paths.
        file_to_traits: list parallel to ``files``; each entry is a dict with at
            least the keys 'region', 'is_governmental' and 'fulltime'.
        file_to_i: dict mapping each path in ``files`` to its position.
        regions, is_governmentals, fulltimes: distinct values of each trait,
            in order of first appearance.

    Args:
        path: directory to scan.
        deep: when true, open every workbook and take the traits from its first
            parsed sheet; otherwise derive the traits from the file names only.
    """

    _PAT_COUNTRY = re.compile(r'СВОД_ВПО1_(?P<is_governmental>[А-Я]+)_(?P<fulltime>[А-Яа-я\s-]+)\.xls')
    _PAT_REGION = re.compile(r'(?P<region>[а-яА-Я\s-]+)_(?P<is_governmental>[А-Я]+)_(?P<fulltime>[а-яА-Я\s-]+)\.xls')

    def _init_deep_inspect(self, path):
        """Collect workbooks by glob pattern and read the traits from their contents."""
        self.files = (
            glob.glob(os.path.join(path, 'СВОД_ВПО1*.xls'))
            + glob.glob(os.path.join(path, 'Своды ВПО-1*/*/*.xls')))
        self.file_to_i = dict((f, i) for i, f in enumerate(self.files))
        self.file_to_traits = list()
        for f in self.files:
            # TODO: use iterables, not lists; read only header; don't create pd.DataFrame yet;
            # NB: splitting names would've been faster but I'm rather hesitant to bind to them
            #     as they might be unstable;
            header = read_vpo1(f).pages[0].header
            # TODO: namedtuple
            self.file_to_traits.append(
                dict((field, getattr(header, field)) for field in VPO1Header.DESC))

    def _init_name_inspect(self, path):
        """Walk `path` recursively and derive the traits from the file names."""
        # The country-wide pattern is tried first; the first splitter that
        # recognises a name wins, and unrecognised names are ignored.
        splitters = (self.split_name_country, self.split_name_region)
        for root, _dirs, names in os.walk(path):
            for name in names:
                for split in splitters:
                    parsed = split(name)
                    if parsed is None:
                        continue
                    _, traits = parsed
                    # Keep the directory so the file can be opened later
                    # (os.walk yields bare base names).
                    self.files.append(os.path.join(root, name))
                    self.file_to_traits.append(traits)
                    break
        self.file_to_i = dict((f, i) for i, f in enumerate(self.files))

    @staticmethod
    def split_name_country(s):
        """Parse a country-wide file name; return ``(s, traits)`` or ``None`` if it does not match."""
        m = VPO1Set._PAT_COUNTRY.match(s)
        if not m:
            return None
        return (s, dict(region='Russia',
                        is_governmental=m.group('is_governmental'),
                        fulltime=m.group('fulltime')))

    @staticmethod
    def split_name_region(s):
        """Parse a regional file name; return ``(s, traits)`` or ``None`` if it does not match."""
        m = VPO1Set._PAT_REGION.match(s)
        if not m:
            return None
        return (s, dict(m.groupdict()))

    def __init__(self, path, deep=False):
        self.files = []
        self.file_to_traits = []
        self.file_to_i = {}
        if deep:
            self._init_deep_inspect(path)
        else:
            self._init_name_inspect(path)
        VPO1SET_TRAITS = ['region', 'is_governmental', 'fulltime']
        for field in VPO1SET_TRAITS:
            values = (traits[field] for traits in self.file_to_traits)
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the lists are the same on every run.
            setattr(self, field + 's', list(dict.fromkeys(values)))

    def get_vpo1(self, region, is_governmental, fulltime):
        """Return the path of the first indexed file whose traits all match.

        The result is an entry of ``files``; pass it to `read_vpo1` to load
        the workbook.

        Raises:
            KeyError: if no indexed file has this combination of traits.
        """
        wanted = dict(region=region,
                      is_governmental=is_governmental,
                      fulltime=fulltime)
        for filename, traits in zip(self.files, self.file_to_traits):
            if all(traits[key] == value for key, value in wanted.items()):
                return filename
        raise KeyError('No VPO-1 file with region={!r}, is_governmental={!r}, fulltime={!r}'
                       .format(region, is_governmental, fulltime))

In [170]:
import unittest


class VPO1AssumptionsTest(unittest.TestCase):
    """Smoke tests run against the real example workbook and the current directory.

    They need the data files on disk, relative to the working directory.
    """

    # Workbook shared by the tests that read a file.
    EXAMPLE_XLS = 'Своды ВПО-1 2018/Государственные/Воронежская область_ГОС_очная.xls'

    def test_sheet_grad(self):
        """A single sheet can be parsed into a page with a DataFrame table."""
        sheetname = 'Р2_1_2 (4)'
        xls = xlrd.open_workbook(self.EXAMPLE_XLS)
        sheet = xls.sheet_by_name(sheetname)
        rows = list(map(VPO1Row, map(unpack_row, sheet.get_rows())))
        # Raises VPO1ParsingError (failing the test) if the layout is not recognised.
        page = VPO1Page(rows)
        self.assertIsInstance(page.table, pd.DataFrame)

    def test_read_vpo1(self):
        """The workbook-level traits are taken from the sheet preamble."""
        vpo1 = read_vpo1(self.EXAMPLE_XLS)
        self.assertEqual(vpo1.region, 'Воронежская область')
        self.assertEqual(vpo1.fulltime, 'очная')

    def test_vpo1set_nonempty(self):
        """Name-based inspection of the working directory finds at least one file."""
        vpo1set = VPO1Set('.')
        # assertGreater still runs under `python -O` and reports both operands on failure.
        self.assertGreater(len(vpo1set.files), 0)
        

# Collect the tests of the case above and run them inline with per-test output.
suite = unittest.TestLoader().loadTestsFromTestCase(VPO1AssumptionsTest)
unittest.TextTestRunner(verbosity=2).run(suite)

test_read_vpo1 (__main__.VPO1AssumptionsTest) ... ok
test_sheet_grad (__main__.VPO1AssumptionsTest) ... ok
test_vpo1set_nonempty (__main__.VPO1AssumptionsTest) ... ok

----------------------------------------------------------------------
Ran 3 tests in 0.704s

OK


<unittest.runner.TextTestResult run=3 errors=0 failures=0>