In [11]:
%load_ext autoreload
%autoreload 2

from IPython.display import display_html, HTML, display
# Load the notebook stylesheet and inject it into the page.
# Read the file with pathlib instead of shelling out to `cat`, so the cell
# does not depend on a POSIX shell being available. `css` stays a list of
# lines (without line endings), as the shell capture produced, so the joined
# HTML is the same as before.
from pathlib import Path
css = Path('styles.css').read_text(encoding='utf-8').splitlines()
display(HTML(''.join(css)))

import numpy as np

import display_helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from erafixer import EraFixer, COL_LOOKUP

In [13]:
# Path of the ERA Excel workbook that is passed to EraFixer below.
# Alternative input file, kept for reference:
# fn = 'ERA2018_Pubs_02_27-Oct-17.xlsx'
fn = 'all.xlsx'

### Command Line Help

```
➜  python erafixer.py --help
usage: erafixer.py [-h] [--detect_author AUTHOR] [--detect_journal JOURNAL]
                   [--set_discipline DISCIPLINE] [--split_disciplines]
                   [--prefix PREFIX] [--carry_forward_forcs]
                   [--set_forc FORC_STRING] [--justify JUSTIFY_STRING]
                   [--sheet_index SHEET_INDEX] [--verbose]
                   ERAFILE

Process and update ERA codes

positional arguments:
  ERAFILE               ERA file as excel spreadsheet

optional arguments:
  -h, --help            show this help message and exit
  --detect_author AUTHOR
                        Part of the author name in AUTHOR column, should be
                        unique substring
  --detect_journal JOURNAL
                        Part of the author name in JOURNAL column, should be
                        unique substring
  --set_discipline DISCIPLINE
                        Discipline to be set
  --split_disciplines   Split ERAFILE into different files called
                        <PREFIX>_<DISC>.xlsx for each discipline
  --prefix PREFIX       Prefix for split-disciplines
  --carry_forward_forcs
                        Carry 2015 codes forward into the corresponding 2018
                        columns
  --set_forc FORC_STRING
                        Apply the FORC string
  --justify JUSTIFY_STRING
                        Justification string [optional for --set_forc]
  --sheet_index SHEET_INDEX
                        Excel sheet to use, defaults to first sheet
  --verbose             Show some output, default false

```

### Create EraFixer instance

Note the `sheet_index=1` passed to the constructor. If it is not passed and more than one sheet is present, a prompt will ask for the index number (this works in both the notebook and the CLI).

In [50]:
# Open the workbook on sheet index 1 (reported in the log output as
# 'Tab 1 for 02'); verbose=True makes EraFixer print progress messages.
erafixer = EraFixer(fn=fn, sheet_index=1, verbose=True)

Parsing file all.xlsx
Using sheet index 1 - Tab 1 for 02
Adding HANDLED (default 0) column to spreadsheet
Adding DISCIPLINE (default NaN) column to spreadsheet


### Set Discipline

Corresponds to the `--detect_author` and `--detect_journal` options

#### By Author

In [52]:
# (author surname, discipline) pairs, applied in the order listed.
# Per the log output, each call sets the discipline on rows that match the
# author and whose discipline is still empty.
for surname, discipline in (
    ('Zvyagin', 'biophotonics'),
    ('Schwab', 'astro'),
    ('Gee', 'astro'),
):
    erafixer.set_author_discipline(surname, discipline)

Setting discipline to 'biophotonics' for 'Zvyagin'
Found 0 matches for 'Zvyagin' with empty discipline
Found 0 total rows for AUTHORS=Zvyagin
Setting discipline to 'astro' for 'Schwab'
Found 0 matches for 'Schwab' with empty discipline
Found 0 total rows for AUTHORS=Schwab
Setting discipline to 'astro' for 'Gee'
Found 0 matches for 'Gee' with empty discipline
Found 0 total rows for AUTHORS=Gee


#### By Journal

In [53]:
# (journal-name fragment, discipline) pairs, applied in the order listed.
# Per the log output, matching is done against the PARENT_DOC column and only
# rows whose discipline is still empty are updated.
for journal_fragment, discipline in (
    ('astrophysical', 'astro'),
    ('geophysical', 'geo'),
    ('geophysics', 'geo'),
    ('optics express', 'photonics'),
):
    erafixer.set_journal_discipline(journal_fragment, discipline)

Setting discipline to 'astro' for 'astrophysical'
Found 159 matches for 'astrophysical' with empty discipline
Found 159 total rows for PARENT_DOC=astrophysical
Setting discipline to 'geo' for 'geophysical'
Found 0 matches for 'geophysical' with empty discipline
Found 0 total rows for PARENT_DOC=geophysical
'geo' not in PhysAstro, setting HANDLED=1
Setting discipline to 'geo' for 'geophysics'
Found 4 matches for 'geophysics' with empty discipline
Found 4 total rows for PARENT_DOC=geophysics
'geo' not in PhysAstro, setting HANDLED=1
Setting discipline to 'photonics' for 'optics express'
Found 96 matches for 'optics express' with empty discipline
Found 96 total rows for PARENT_DOC=optics express


#### Save
The methods above do not save anything themselves, so `save()` must be called explicitly. The CLI, by contrast, **will** automatically save back to the spreadsheet when given the `--detect_author` or `--detect_journal` options.

In [54]:
# Write the updated sheets back to the workbook; the call returns the filename.
erafixer.save()

Writing sheet 'Legend' to all.xlsx
Writing sheet 'Tab 1 for 02' to all.xlsx
Writing sheet 'Tab 2 for 02' to all.xlsx


'all.xlsx'

In [8]:
# Write one '<prefix>_<discipline>.xlsx' file per discipline, using the prefix
# 'disc'; the call returns the list of filenames written.
erafixer.split_disciplines('disc')

Writing dataframe to disc_geo.xlsx with 4 records
Writing dataframe to disc_biophotonics.xlsx with 39 records
Writing dataframe to disc_astro.xlsx with 192 records
Writing dataframe to disc_photonics.xlsx with 94 records


['disc_geo.xlsx',
 'disc_biophotonics.xlsx',
 'disc_astro.xlsx',
 'disc_photonics.xlsx']

#### Carry Forward FORCS

In [9]:
# Open the astro file written by split_disciplines. No sheet_index is given;
# the log shows sheet index 0 ('Sheet1') being used.
astro_erafixer = EraFixer(fn='disc_astro.xlsx', verbose=True)

Parsing file disc_astro.xlsx
Using sheet index 0 - Sheet1


In [10]:
# Copy the 2015 FOR codes into the 2018 columns for rows with HANDLED=0
# (as reported in the log output).
astro_erafixer.carry_forward_forcs()

Found 192 matches for HANDLED=0
Copying 2015 FOR codes to 2018 for unhandled rows


In [11]:
# Persist the carried-forward codes to disc_astro.xlsx; returns the filename.
astro_erafixer.save()

Writing sheet 'Sheet1' to disc_astro.xlsx


'disc_astro.xlsx'

#### Apply FORC_STRING

In [12]:
# Re-create the EraFixer from disc_astro.xlsx, which was saved in the
# previous cell.
astro_erafixer = EraFixer(fn='disc_astro.xlsx', verbose=True)

Parsing file disc_astro.xlsx
Using sheet index 0 - Sheet1


In [13]:
# FOR code string to apply to the astro records.
# NOTE(review): presumably the same value the CLI takes via --set_forc — confirm.
forc_string = '0201'

astro_erafixer.set_forc_string(forc_string)

In [18]:
# Alternative FORC strings, kept for reference.
# NOTE(review): the format looks like comma-separated 'CODE:PERCENT' pairs in
# which the last percentage may be omitted — confirm against erafixer.
# forc_string = '0201:40,0203'
# forc_string = '0201:40,0203:60'
# forc_string = '0201:40,0203:30,0204'
# forc_string = '0201:40,0203:25,0204:30'

# Optional justification text.
# NOTE(review): justify_string is assigned here but is not passed to any call
# in the visible cells — confirm whether it should go to set_forc_string.
justify_string = None
# justify_string = 'Development of Raman lasers'

In [19]:
# Persist the applied FOR code string to disc_astro.xlsx; returns the filename.
astro_erafixer.save()

Writing sheet 'Sheet1' to disc_astro.xlsx


'disc_astro.xlsx'

### Test author match

We want to make sure we are matching the correct authors. Each author should be supplied as the full last name of the author in question. Names are not stored correctly in the Excel sheet (they should be UTF-8 but are mis-encoded), yet a search that uses the same mis-encoded characters will still match; see the first name in the list below.

In [20]:
# Surnames chosen to probe the author matcher; compare the counts printed
# below for near-identical names.
authors = [
    'JelÃ­nkovÃ¡',  # mis-encoded in the sheet; searched with the same bytes
    'gee',         # compare with 'McGee'
    'McGee',
    '    sTeEl ',  # padded, mixed case; compare with 'steel'
    'steel',
    'steele',      # longer surname sharing the 'steel' prefix
    'casteels',    # surname containing 'steel' in the middle
    'xia',         # compare with 'xiao'
    'xiao',
    'Zvyagin'
]

for author in authors:
    # blank_discipline=False is passed so the lookup is not limited to rows
    # with an empty discipline (presumed from the parameter name and the
    # "with empty discipline" log wording — confirm).
    row_match = erafixer.get_matching_rows(
        author, 'AUTHORS', blank_discipline=False
    )

    # Nothing to render when the lookup came back empty.
    if not row_match:
        continue

    display_helpers.show_matches(erafixer, row_match, author)

Found 4 matches for AUTHORS=JelÃ­nkovÃ¡


Found 1 matches for AUTHORS=gee


Found 2 matches for AUTHORS=McGee


Found 102 matches for AUTHORS=    sTeEl 


Found 102 matches for AUTHORS=steel


Found 7 matches for AUTHORS=steele


Found 1 matches for AUTHORS=casteels


Found 7 matches for AUTHORS=xia


Found 3 matches for AUTHORS=xiao


Found 39 matches for AUTHORS=Zvyagin
