In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display_html, HTML, display
css = !cat styles.css
display(HTML(''.join(css)))

import numpy as np

import display_helpers

In [2]:
from erafixer import EraFixer, COL_LOOKUP

In [3]:
# Excel workbook to process; the commented-out name is an alternative input file.
# fn = 'ERA2018_Pubs_02_27-Oct-17.xlsx'
fn = 'all.xlsx'

### Command Line Help

```
➜  python erafixer.py --help
usage: erafixer.py [-h] [--detect_author AUTHOR] [--detect_journal JOURNAL]
                   [--set_discipline DISCIPLINE] [--split_disciplines]
                   [--prefix PREFIX] [--carry_forward_forcs]
                   [--set_forc FORC_STRING] [--justify JUSTIFY_STRING]
                   [--sheet_index SHEET_INDEX] [--verbose]
                   ERAFILE

Process and update ERA codes

positional arguments:
  ERAFILE               ERA file as excel spreadsheet

optional arguments:
  -h, --help            show this help message and exit
  --detect_author AUTHOR
                        Part of the author name in AUTHOR column, should be
                        unique substring
  --detect_journal JOURNAL
                        Part of the author name in JOURNAL column, should be
                        unique substring
  --set_discipline DISCIPLINE
                        Discipline to be set
  --split_disciplines   Split ERAFILE into different files called
                        <PREFIX>_<DISC>.xlsx for each discipline
  --prefix PREFIX       Prefix for split-disciplines
  --carry_forward_forcs
                        Carry 2015 codes forward into the corresponding 2018
                        columns
  --set_forc FORC_STRING
                        Apply the FORC string
  --justify JUSTIFY_STRING
                        Justification string [optional for --set_forc]
  --sheet_index SHEET_INDEX
                        Excel sheet to use, defaults to first sheet
  --verbose             Show some output, default false

```

### Create EraFixer instance

Note the `sheet_index=1` passed to the constructor. If it is not passed and more than one sheet is present, a prompt will ask for the index number (this works in both the notebook and the CLI).

In [4]:
# Open the workbook on the sheet at index 1; passing sheet_index explicitly
# avoids the interactive sheet prompt. verbose=True prints progress messages.
erafixer = EraFixer(fn=fn, sheet_index=1, verbose=True)

Parsing file all.xlsx
Using sheet index 1 - Tab 1 for 02


### Set Discipline

Corresponds to the `--detect_author` and `--detect_journal` options

#### By Journal

In [5]:
# (text to look for in the journal column, discipline to assign) pairs,
# consumed in order by the loop in the next cell.
journal_list = [
    ('astrophysical', 'astro'),
    ('geophysics', 'geo'),
    ('optics express', 'photonics')
]

In [6]:
# Assign a discipline to the rows matching each journal pattern.
for journal_pattern, discipline in journal_list:
    erafixer.set_journal_discipline(journal_pattern, discipline)

Setting discipline to 'astro' for 'astrophysical' on 159 rows
Setting discipline to 'geo' for 'geophysics' on 4 rows
'geo' not in PhysAstro, setting HANDLED=1
Setting discipline to 'photonics' for 'optics express' on 96 rows


#### By Author

In [7]:
# WARNING: matching on 'marco' produces a false positive;
# see row 1 of disc_astro.xlsx after the split step.
erafixer.set_author_discipline('marco', 'astro')

Setting discipline to 'astro' for 'marco' on 52 rows


In [8]:
# (author name to look for, discipline to assign) pairs, consumed in order by
# the loop in the next cell.
# NOTE(review): contains near-identical surnames (gee/mcgee, steel/steele/casteels);
# the "Test author match" section at the end prints the rows each one matches.
author_list = [
    ('zvyagin', 'biophotonics'),
    ('schwab', 'astro'),
    ('gee', 'astro'),
    ('mcgee', 'astro'),
    ('steel', 'quantum'),
    ('steele', 'astro'),
    ('casteels', 'astro'),
    ('spence', 'photonics')
]

In [9]:
# Assign a discipline to the rows matching each author name.
for author_name, discipline in author_list:
    erafixer.set_author_discipline(author_name, discipline)

Setting discipline to 'biophotonics' for 'zvyagin' on 37 rows
Setting discipline to 'astro' for 'schwab' on 30 rows
Setting discipline to 'astro' for 'gee' on 1 rows
Setting discipline to 'astro' for 'mcgee' on 1 rows
Setting discipline to 'quantum' for 'steel' on 86 rows
Setting discipline to 'astro' for 'steele' on 6 rows
Setting discipline to 'astro' for 'casteels' on 1 rows
Setting discipline to 'photonics' for 'spence' on 30 rows


#### Save
None of the methods above saves automatically, so call `save()` explicitly, as in the next cell. The CLI, by contrast, **will** automatically save back to the spreadsheet when given the `--detect_author` or `--detect_journal` options.

In [10]:
# Write the discipline assignments to disk; save() reports and returns the filename.
erafixer.save()

File saved: all.xlsx


'all.xlsx'

#### Split Disciplines

Note: There could also feasibly be a `--detect_discipline` command that would work against the giant spreadsheet. 

In [11]:
# Write one workbook per discipline, named disc_<discipline>.xlsx;
# the call returns the list of files written.
erafixer.split_disciplines('disc')

File saved: disc_quantum.xlsx
File saved: disc_geo.xlsx
File saved: disc_astro.xlsx
File saved: disc_biophotonics.xlsx
File saved: disc_photonics.xlsx


['disc_quantum.xlsx',
 'disc_geo.xlsx',
 'disc_astro.xlsx',
 'disc_biophotonics.xlsx',
 'disc_photonics.xlsx']

#### Carry Forward FORCS

In [12]:
# Load the astro workbook produced by the split step. No sheet_index is given,
# and the first sheet (index 0) is used.
astro_erafixer = EraFixer(fn='disc_astro.xlsx', verbose=True)

Parsing file disc_astro.xlsx
Using sheet index 0 - Sheet1


In [13]:
# Copy the 2015 FOR codes into the corresponding 2018 columns for unhandled rows.
astro_erafixer.carry_forward_forcs()

Copying 2015 FOR codes to 2018 for unhandled rows
Found 250 total unhandled rows
Moving 4 values to for2_e18
Moving 0 values to for3perc_e18
Moving 0 values to for3_e18
Moving 54 values to for1_e18
Moving 1 values to for4_e18
Moving 4 values to for2perc_e18
Moving 1 values to for4perc_e18
Moving 54 values to for1perc_e18


In [14]:
# Write the carried-forward codes back to disc_astro.xlsx.
astro_erafixer.save()

File saved: disc_astro.xlsx


'disc_astro.xlsx'

#### Apply FORC_STRING

In [15]:
# Re-open disc_astro.xlsx as saved by the carry-forward step.
astro_erafixer = EraFixer(fn='disc_astro.xlsx', verbose=True)

Parsing file disc_astro.xlsx
Using sheet index 0 - Sheet1


In [16]:
# FORC string holding a single code with no ':<number>' suffix.
forc_string = '0201'

# No author is passed here.
# NOTE(review): presumably this applies the string to every eligible row in the file — confirm.
astro_erafixer.set_forc_string(forc_string)

Applying FORC_STRING '0201'


In [17]:
# Write the applied FORC string back to disc_astro.xlsx.
astro_erafixer.save()

File saved: disc_astro.xlsx


'disc_astro.xlsx'

##### More examples

In [18]:
# Other examples: apply a FORC string together with a justification and an author.
photonics_erafixer = EraFixer(fn='disc_photonics.xlsx', verbose=True)

# Passed as the `author` argument of set_forc_string below.
# NOTE(review): presumably restricts the update to rows matching this author — confirm.
author = 'spence'

# FORC string to apply. The commented-out alternatives show comma-separated
# codes, each optionally suffixed with ':<number>'.
# NOTE(review): the numbers look like percentages, and a code without one
# presumably receives the remainder — confirm against erafixer. The numbers in
# the last alternative sum to 95, not 100; check whether that is intentional.
forc_string = '0205'
# forc_string = '0201:40,0203'
# forc_string = '0201:40,0203:60'
# forc_string = '0201:40,0203:30,0204'
# forc_string = '0201:40,0203:25,0204:30'

# Justification text; the CLI help describes it as optional for --set_forc.
# justify_string = None
justify_string = 'Development of Raman lasers'

photonics_erafixer.set_forc_string(forc_string, justify_string=justify_string, author=author)
photonics_erafixer.save()

Parsing file disc_photonics.xlsx
Using sheet index 0 - Sheet1
Applying FORC_STRING '0205'
File saved: disc_photonics.xlsx


'disc_photonics.xlsx'

### Test author match

We want to make sure we are matching the correct authors. The author should be supplied as the full last name of the author in question. Names are not stored correctly in the Excel sheet (they should be UTF-8) but will still match on the badly encoded characters. See the first name in the list below.

In [19]:
# Names to probe. The first entry is deliberately mis-encoded, mirroring how the
# name is stored in the spreadsheet, so do not "fix" its characters.
# NOTE(review): the mixed-case and space-padded entries presumably check that
# matching ignores case and surrounding whitespace; '    sTeEl ' and 'steel'
# print the same rows in the output below.
authors = [
    'JelÃ­nkovÃ¡',
    'gee',
    'McGee',
    '    sTeEl ',
    'steel',
    'steele',
    'casteels',
    'xia',
    'xiao',
    'Zvyagin'
]

# Print the rows matched in the AUTHORS column for each probe name and, when
# there are any, render them with the display helper.
# NOTE(review): blank_discipline=False presumably also returns rows whose
# discipline is already set — confirm against erafixer.
for surname in authors:
    matched_rows = erafixer.get_matching_rows(surname, 'AUTHORS', blank_discipline=False)
    print(matched_rows)
    if matched_rows:
        display_helpers.show_matches(erafixer, matched_rows, surname)

[242, 268, 1365, 1492]


[1374]


[29, 374]


[3, 8, 42, 56, 64, 69, 93, 109, 118, 122, 135, 157, 174, 189, 206, 214, 220, 239, 249, 267, 286, 308, 319, 320, 339, 510, 535, 735, 748, 757, 763, 764, 770, 772, 776, 1050, 1054, 1060, 1061, 1072, 1086, 1090, 1092, 1094, 1097, 1108, 1110, 1114, 1119, 1134, 1135, 1138, 1148, 1153, 1158, 1162, 1167, 1168, 1174, 1179, 1182, 1192, 1201, 1206, 1215, 1216, 1219, 1223, 1240, 1246, 1283, 1290, 1292, 1301, 1308, 1309, 1313, 1325, 1340, 1341, 1342, 1345, 1348, 1371, 1373, 1380, 1457, 1460, 1482, 1486, 1490, 1499, 1504, 1573, 1576, 1618, 1637, 1641, 1657, 1679, 1680, 1681]


[3, 8, 42, 56, 64, 69, 93, 109, 118, 122, 135, 157, 174, 189, 206, 214, 220, 239, 249, 267, 286, 308, 319, 320, 339, 510, 535, 735, 748, 757, 763, 764, 770, 772, 776, 1050, 1054, 1060, 1061, 1072, 1086, 1090, 1092, 1094, 1097, 1108, 1110, 1114, 1119, 1134, 1135, 1138, 1148, 1153, 1158, 1162, 1167, 1168, 1174, 1179, 1182, 1192, 1201, 1206, 1215, 1216, 1219, 1223, 1240, 1246, 1283, 1290, 1292, 1301, 1308, 1309, 1313, 1325, 1340, 1341, 1342, 1345, 1348, 1371, 1373, 1380, 1457, 1460, 1482, 1486, 1490, 1499, 1504, 1573, 1576, 1618, 1637, 1641, 1657, 1679, 1680, 1681]


[491, 872, 964, 970, 971, 987, 990]


[887]


[5, 7, 15, 303, 511, 1234, 1249]


[273, 872, 1366]


[50, 57, 70, 156, 203, 738, 740, 747, 1088, 1115, 1117, 1121, 1123, 1126, 1132, 1147, 1155, 1170, 1171, 1199, 1202, 1207, 1229, 1239, 1274, 1278, 1282, 1295, 1298, 1319, 1323, 1328, 1481, 1487, 1493, 1505, 1579, 1597, 1632]
