This notebook makes some visualizations, both static and interactive, with [Altair](https://altair-viz.github.io/).

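Make sure Altair is installed (the notebook also imports pandas, numpy, and NLTK's stopwords corpus, so those need to be available too):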

```python
!pip install altair vega_datasets
```

In [93]:
# Output directory for the chart JSON files written by the `.save(...)` cells
# below (for hosting the plots from a Jekyll site).
# NOTE: this is a machine-specific absolute path -- set it to your own
# directory before running the save cells. Keep the trailing '/': file names
# are appended with plain string concatenation.
myJekyllDir = '/Users/jnaiman/ReadingTimeMachine.github.io/assets/json/'

In [233]:
import altair as alt
import string
import pickle
import pandas as pd
import numpy as np

# debug
from importlib import reload
import utils
reload(utils)

from utils import subset_by_percent, return_matrix_chart_withHist

In [95]:
# Directory holding the input pickles; keep the trailing '/' because paths are
# built by string concatenation (e.g. data_dir+'letters.pickle').
data_dir = './data/'

## Data

In [194]:
# Load the confusion counts used throughout the notebook.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles you produced yourself.
# `letters`, `words` and `words_cleaned` are consumed below as nested mappings
# {ground-truth (PDF) token: {OCR token: count}}.
with open(data_dir+'letters.pickle','rb') as f:
    letters = pickle.load(f)
with open(data_dir+'words.pickle','rb') as f:
    words = pickle.load(f)
# all clean words
with open(data_dir+'words_cleaned.pickle','rb') as f:
    words_cleaned = pickle.load(f)

Format characters for plotting:

In [97]:
# Flatten the nested {PDF char: {OCR char: count}} mapping into long-form
# columns (one row per PDF/OCR character pair) for the matrix plots.
# With normalized=True, 'counts' is the percentage of all OCR outcomes recorded
# for that PDF character; the raw count is always kept alongside it.
normalized=True
pdf_letters = []; ocr_letters = []; counts = []
counts_un = []
for pl, ols in letters.items():
    # denominator: total number of OCR observations for this PDF character,
    # or 1.0 to leave the values unscaled
    cdiv = sum(ols.values(), 0.0) if normalized else 1.0

    pdf_letters.extend([pl]*len(ols))
    ocr_letters.extend(ols.keys())
    counts.extend(c/cdiv*100 for c in ols.values())
    counts_un.extend(ols.values())

df_char = pd.DataFrame({'pdf_letters':pdf_letters,
                        'ocr_letters':ocr_letters,
                        'counts':counts,
                        'counts unnormalized':counts_un})
# save
###df_char.to_csv(char_counts_df_file, index=False)

In [98]:
# Preview the long-form character table built above.
df_char.head()

Unnamed: 0,pdf_letters,ocr_letters,counts,counts unnormalized
0,r,r,98.29838,9979050
1,r,^,0.753827,76527
2,r,7,0.000493,50
3,r,u,0.060354,6127
4,r,x,0.089462,9082


# Time distribution of articles

Interactive (static plot is in paper)

In [135]:
# Monthly article counts per processing stage (one row per month; see preview).
# The path is built from `data_dir`, like the pickle inputs, so the data
# location is configured in a single place instead of being hard-coded here.
df_times = pd.read_csv(data_dir + 'all_time_plot.csv')
df_times.head()

Unnamed: 0,Date,All ArXiv,Astronomy Articles,Compiled,Marked & Compiled,Aligned
0,1991-07-01,2,0,0,0,0
1,1991-08-01,28,0,0,0,0
2,1991-09-01,58,0,0,0,0
3,1991-10-01,76,0,0,0,0
4,1991-11-01,64,0,0,0,0


In [140]:
# First and last month that contain at least one astronomy article.
# 'Date' holds 'YYYY-MM-DD' strings (see output), so min/max on the strings
# is also chronological min/max.
astro_dates = df_times.loc[df_times['Astronomy Articles'] != 0, 'Date']
min_time, max_time = astro_dates.min(), astro_dates.max()
min_time,max_time

('1993-03-01', '2011-12-01')

In [147]:
# Restrict to the months between min_time and max_time (both ends included)
in_window = df_times['Date'].between(min_time, max_time)
# for plotting: swap zeros for NaN
df_times_subset = df_times[in_window].replace(0, np.nan)

In [148]:
# Wide -> long: one row per (Date, article subset) pair, with the subset name
# in 'Article Subset' and its monthly count in 'Number'.
df_melt = df_times_subset.melt(
    id_vars='Date',
    var_name='Article Subset',
    value_name='Number',
)
df_melt.head()

Unnamed: 0,Date,Article Subset,Number
0,1993-03-01,All ArXiv,492.0
1,1993-04-01,All ArXiv,498.0
2,1993-05-01,All ArXiv,526.0
3,1993-06-01,All ArXiv,524.0
4,1993-07-01,All ArXiv,616.0


In [156]:
# Subset names in table order (every column after 'Date'); the same expression
# is used below to fix the colour/legend order of the time chart.
df_times_subset.columns[1:].tolist()

['All ArXiv', 'Astronomy Articles', 'Compiled', 'Marked & Compiled', 'Aligned']

In [164]:
# Chart title plus a subtitle hint
# (https://stackoverflow.com/questions/57244390/how-to-add-a-subtitle-to-an-altair-generated-chart)
chart_title = alt.TitleParams(
    "Time distribution of articles in our dataset",
    subtitle=["(Hover over lines to see exact numbers)"],
)

# Selection that follows the mouse: picks the nearest point, matched on 'Date'
nearest = alt.selection_point(
    nearest=True, on='mouseover', fields=['Date'], empty=False
)

# Legend/colour order follows the column order of the wide table
subset_order = df_times_subset.columns[1:].tolist()

# Layer 1: one line per article subset, on a symlog y-axis
line = alt.Chart(df_melt, title=chart_title).mark_line().encode(
    x='Date:T',
    y=alt.Y('Number:Q', scale=alt.Scale(type='symlog')),
    color=alt.Color('Article Subset:N', sort=subset_order),
)

# Shared starting point for the two helper layers that only need the x position
base = alt.Chart(df_melt)

# Layer 2: invisible points spanning the chart; they carry the selection and
# so tell us the x-value of the cursor
selectors = base.mark_point().encode(
    x='Date:T',
    opacity=alt.value(0),
).add_params(nearest)

# Layer 3: points on the lines, visible only at the selected date
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Layer 5: text labels next to the points, shown only at the selected date
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Number:Q', alt.value(' '))
)

# Layer 4: vertical rule at the selected date
rules = base.mark_rule(color='gray').encode(
    x='Date:T',
).transform_filter(nearest)

# Stack the five layers (order = drawing order) and size the chart
time_chart = alt.layer(line, selectors, points, rules, text).properties(
    width=600, height=300
)

In [165]:
# Render the interactive time chart inline.
time_chart

In [166]:
# Export the chart to a JSON file under myJekyllDir for the website.
time_chart.save(myJekyllDir + 'timechart.json')

# Characters

## Static, just upper-case alphabetic characters (in paper)

In [99]:
# Alphabet lists used for filtering:
#   alphas_lower -> a-z, alphas -> a-z followed by A-Z, upperAlphas -> A-Z
alphas_lower = list(string.ascii_lowercase)
alphas = alphas_lower + [letter.upper() for letter in alphas_lower]

# the upper-case letters are everything after the lower-case block
upperAlphas = alphas[len(alphas_lower):]
print(upperAlphas)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [100]:
# Keep only the pairs where both the PDF and the OCR character are A-Z
both_upper = df_char['pdf_letters'].isin(upperAlphas) & df_char['ocr_letters'].isin(upperAlphas)
# mostly for formatting (the preview below shows the renamed columns and the
# added 'name' label column)
df_upperA = subset_by_percent(df_char.loc[both_upper].copy(), tol_count = 0)
df_upperA.head()

shape of output= (576, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,A,A,96.392449,311925.0,96.39%
1,A,X,0.58282,1886.0,0.58%
2,A,C,0.017614,57.0,0.02%
3,A,I,0.014215,46.0,0.01%
4,A,L,0.040482,131.0,0.04%


In [189]:
# --- Parameters for the static upper-case confusion-matrix figure ---
alphas_hist = ['A', 'K'] # ground-truth letters to draw a histogram for

# matrix appearance
min_percent = 0.01 # only plot things that are at this level (in %) or above in terms of frequency in dataset
scheme = 'viridis'
height=400 # matrix
width=400 # matrix
log=True # log of colormap?
color_title = 'Percent in %'

# column names in the frame being plotted.
# (A dead `percent_column = 'counts'` assignment used to sit above; it was
# always overwritten by the value below, so it has been removed.)
percent_column = "% of all OCR tokens"
count_column = "Total Count of PDF token"

# axis titles
pdf_title='GT Characters'
ocr_title='OCR Characters'

# width of each histogram panel
hist_width = 400

In [192]:
# Static figure: confusion matrix (left) next to one histogram per letter in
# `alphas_hist` (right).

# height of each histogram panel, derived from the matrix height
hist_height = height // (len(alphas_hist)+1)

# Axis ordering: ground-truth letters in sorted order; the OCR axis uses the
# same order followed by any OCR-only symbols (also sorted).
sort_pdf = sorted(df_upperA['pdf_letters'].unique())
extra_ocr = sorted(o for o in df_upperA['ocr_letters'].unique() if o not in sort_pdf)
sort_ocr = sort_pdf + extra_ocr

# legend placement: horizontal gradient centred above the matrix
length = 200
legendY = -50
legendX = 0 + width//2 - length//2

if log:
    color = alt.Color(percent_column+":Q", scale=alt.Scale(type='log',scheme=scheme, domain=[min_percent, 100]),
                      title=color_title, legend=alt.Legend(
                        orient='none',
                        legendX=legendX, legendY=legendY,
                        direction='horizontal',
                        titleAnchor='middle', gradientLength=length))
else:
    # fixed: this branch used percent_column+"value:Q", which names a field
    # that does not exist in the data, so log=False gave an empty colour encoding
    color = alt.Color(percent_column+":Q", scale=alt.Scale(scheme=scheme,domain=[min_percent, 100]),
                      title=color_title)

# the confusion matrix itself
chart1 = alt.Chart(df_upperA).mark_rect().encode(
    alt.Y("pdf_letters:O",sort=sort_pdf,title=pdf_title),
    alt.X("ocr_letters:O",sort=sort_ocr,title=ocr_title),
    color=color
).properties(
    height=height,
    width=width
)

# one bar chart per requested ground-truth letter
chart_hists = []
for a in alphas_hist:
    dfin_larger = df_upperA.loc[df_upperA['pdf_letters'] == a]
    chart2 = alt.Chart(dfin_larger,title="GT Letter = " + a).mark_bar().transform_filter(
        # range is [min, max], the order the Vega-Lite range predicate documents
        # (it was previously written as [100, min_percent])
        alt.FieldRangePredicate(field=percent_column, range=[min_percent, 100])
    ).encode(
        alt.X('ocr_letters:O', sort='-y',title=ocr_title),
        # use percent_column so the filter and the bar height read the same field
        alt.Y(percent_column+":Q", scale=alt.Scale(type='log',domain=[min_percent,100]), title='Percent in %')
    ).properties(
        width=hist_width,
        height=hist_height
    )
    chart_hists.append(chart2)

chart_hists_all = alt.vconcat(*chart_hists, center=True)

chart = alt.hconcat(chart1,chart_hists_all,center=True)

# Display with larger axis fonts. The configured chart is only displayed here,
# not assigned, so `chart` itself keeps the default fonts.
chart.configure_axis(
    labelFontSize=20,
    titleFontSize=20
)

In [193]:
#df_upperA.loc[df_upperA['pdf_letters']=='K']

## Full alphas, with interactivity

In [104]:
# Letters plus one extra marker token per axis: '@' on the PDF side and '^' on
# the OCR side.
alphas_extend_pdf = alphas + ['@']
alphas_extend_ocr = alphas + ['^']

In [105]:
pdf_is_alpha = df_char['pdf_letters'].isin(alphas_extend_pdf)
ocr_is_alpha = df_char['ocr_letters'].isin(alphas_extend_ocr)

# both sides restricted to letters (+ marker); mostly for formatting
df_alphas = subset_by_percent(df_char.loc[pdf_is_alpha & ocr_is_alpha].copy(), tol_count = 0)
# only the PDF side restricted -> keep all OCR letters
df_alphas_large = subset_by_percent(df_char.loc[pdf_is_alpha].copy(), tol_count = 100)

shape of output= (2615, 5)
shape of output= (2135, 5)


In [106]:
#df_alphas_large['pdf_letters'].unique()

In [107]:
# Matrix + histogram chart for letters, built by the helper from utils.py
chart_alphas = return_matrix_chart_withHist(
    df_alphas,
    df_alphas_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=500,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.01,
)

In [108]:
# Render the letters chart inline.
chart_alphas

In [109]:
# Export the letters chart to a JSON file under myJekyllDir for the website.
chart_alphas.save(myJekyllDir + 'alphas.json')

## Digits

In [110]:
# The ten digit characters '0'..'9' as strings, plus one marker token per axis
# ('@' on the PDF side, '^' on the OCR side).
digits = [str(d) for d in range(10)]
digits_extend_pdf = digits + ['@']
digits_extend_ocr = digits + ['^']

In [111]:
pdf_is_digit = df_char['pdf_letters'].isin(digits_extend_pdf)
ocr_is_digit = df_char['ocr_letters'].isin(digits_extend_ocr)

# both sides restricted to digits (+ marker); mostly for formatting
df_digits = subset_by_percent(df_char.loc[pdf_is_digit & ocr_is_digit].copy(), tol_count = 0)
# only the PDF side restricted -> keep all OCR letters
df_digits_large = subset_by_percent(df_char.loc[pdf_is_digit].copy(), tol_count = 10)
df_digits.head()

shape of output= (121, 5)
shape of output= (1005, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,3,3,93.725501,170721.0,93.73%
1,3,INSERT,2.481471,4520.0,2.48%
2,3,2,0.658249,1199.0,0.66%
3,3,9,0.181718,331.0,0.18%
4,3,8,0.413945,754.0,0.41%


In [115]:
# Matrix + histogram chart for digits, built by the helper from utils.py
chart_digits = return_matrix_chart_withHist(
    df_digits,
    df_digits_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=500,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
)

In [116]:
# Render the digits chart inline.
chart_digits

In [117]:
# Export the digits chart to a JSON file under myJekyllDir for the website.
chart_digits.save(myJekyllDir + 'digits.json')

## Punctuation

In [169]:
# All ASCII punctuation except our two marker characters ('^' and '@'), which
# are handled as special tokens rather than as ordinary punctuation.
# Filtering explicitly replaces the old index()/pop() inside a bare
# `except: pass`, which would have silently hidden any unrelated error.
markers = ('^', '@')
punctuation = [p for p in string.punctuation if p not in markers]

# one marker token per axis: '@' on the PDF side, '^' on the OCR side
punctuation_extend_pdf = punctuation + ['@']
punctuation_extend_ocr = punctuation + ['^']

In [170]:
pdf_is_punct = df_char['pdf_letters'].isin(punctuation_extend_pdf)
ocr_is_punct = df_char['ocr_letters'].isin(punctuation_extend_ocr)

# both sides restricted to punctuation (+ marker); mostly for formatting
df_punctuation = subset_by_percent(df_char.loc[pdf_is_punct & ocr_is_punct].copy(), tol_count = 0)
# only the PDF side restricted -> keep all OCR letters
df_punctuation_large = subset_by_percent(df_char.loc[pdf_is_punct].copy(), tol_count = 10)
df_punctuation.head()

shape of output= (519, 5)
shape of output= (1586, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,.,.,95.470477,1694790.0,95.47%
1,.,INSERT,2.224034,39481.0,2.22%
2,.,:,0.021237,377.0,0.02%
3,.,",",0.592272,10514.0,0.59%
4,.,&,0.004056,72.0,0.0%


In [173]:
# Matrix + histogram chart for punctuation, built by the helper from utils.py
chart_punctuation = return_matrix_chart_withHist(
    df_punctuation,
    df_punctuation_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=500,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
)

In [174]:
# Render the punctuation chart inline.
chart_punctuation

In [175]:
# Export the punctuation chart to a JSON file under myJekyllDir for the website.
chart_punctuation.save(myJekyllDir + 'punctuation.json')

## Others

Finally, all of the other characters that are not listed in one of the types above.

In [179]:
# Characters already covered by the punctuation / letter / digit sections
not_others = punctuation.copy()
not_others.extend(alphas)
not_others.extend(digits)

# The membership test runs once per row of df_char, so look characters up in a
# set instead of scanning the list every time.
known_chars = set(not_others)
others = np.unique(
    [c for c in df_char['pdf_letters'].values if c not in known_chars]
).tolist()

# Drop our marker characters if they are present. Filtering explicitly replaces
# the old index()/pop() inside a bare `except: pass`, which would have silently
# hidden any unrelated error.
others = [c for c in others if c not in ('^', '@')]

In [180]:
# Number of distinct "other" ground-truth characters found.
len(others)

151

In [181]:
# "Other" characters plus one marker token per axis ('@' PDF side, '^' OCR side)
others_extend_pdf = others + ['@']
others_extend_ocr = others + ['^']

In [182]:
pdf_is_other = df_char['pdf_letters'].isin(others_extend_pdf)
ocr_is_other = df_char['ocr_letters'].isin(others_extend_ocr)

# both sides restricted to the "other" characters (+ marker); mostly for formatting
df_others = subset_by_percent(df_char.loc[pdf_is_other & ocr_is_other].copy(), tol_count = 0)
# only the PDF side restricted -> keep all OCR letters
df_others_large = subset_by_percent(df_char.loc[pdf_is_other].copy(), tol_count = 10)
df_others.head()

shape of output= (306, 5)
shape of output= (795, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,,,97.813172,34000593.0,97.81%
1,,INSERT,1.543819,536643.0,1.54%
2,,ω,0.000181,63.0,0.0%
3,,—,0.006881,2392.0,0.01%
4,,”,0.000538,187.0,0.0%


In [185]:
# Matrix + histogram chart for the remaining characters, built by the helper
# from utils.py
chart_others = return_matrix_chart_withHist(
    df_others,
    df_others_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=600,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
)

In [186]:
# Render the "other characters" chart inline (displayed only, not saved).
chart_others

This isn't super illuminating, so leaving it off the summary.

# Word-level matches

In [197]:
# Flatten the nested {PDF word: {OCR word: count}} mapping into long-form
# columns, one row per PDF/OCR word pair. With normalized=True, 'counts' is
# the percentage of all OCR outcomes recorded for that PDF word.
normalized=True
pdf_letters = []; ocr_letters = []; counts = []; counts_unnormed = []
for pl, ols in words.items():
    # denominator: total OCR observations for this PDF word, or 1.0 to leave
    # the values unscaled
    cdiv = sum(ols.values(), 0.0) if normalized else 1.0

    pdf_letters.extend([pl]*len(ols))
    ocr_letters.extend(ols.keys())
    counts.extend(c/cdiv*100 for c in ols.values())
    counts_unnormed.extend(ols.values())

df_words = pd.DataFrame({'pdf_letters':pdf_letters,
                         'ocr_letters':ocr_letters,
                         'counts':counts,
                         'counts unnormalized':counts_unnormed})

In [198]:
# also cleaned words
normalized=True
pdf_letters = []; ocr_letters = []; counts = []; counts_unnormed = []
for pl,ols in words_cleaned.items():
    if normalized:
        cdiv=0.0
        for ol,c in ols.items(): # % in OCR
            cdiv += c
    else:
        cdiv = 1.0
        
    for ol,c in ols.items():
        pdf_letters.append(pl)
        ocr_letters.append(ol)
        counts.append(c/cdiv*100)
        counts_unnormed.append(c)
        
df_words_clean = pd.DataFrame({'pdf_letters':pdf_letters,
                        'ocr_letters':ocr_letters,
                        'counts':counts, 'counts unnormalized':counts_unnormed})

In [199]:
# Preview the cleaned word-level table.
df_words_clean.head()

Unnamed: 0,pdf_letters,ocr_letters,counts,counts unnormalized
0,rvations,obserrvations,100.0,1
1,at,at,91.589144,193167
2,at,,1.289674,2720
3,at,luam,0.000474,1
4,at,the,0.016595,35


## Stop words

In [200]:
from nltk.corpus import stopwords

In [222]:
# lets look for stopwords
# NLTK names its stop-word lists in lower case ('english'). The previous
# 'English' only resolves on case-insensitive filesystems and fails elsewhere.
stop_words_all = stopwords.words('english')
# take out single characters
stop_words = [s for s in stop_words_all if len(s) > 1]

In [223]:
# Number of stop words left after removing the single-character ones.
len(stop_words)

171

In [230]:
pdf_is_stop = df_words_clean['pdf_letters'].isin(stop_words)
ocr_is_stop = df_words_clean['ocr_letters'].isin(stop_words)

# both sides are stop words
df_words_stop = df_words_clean.loc[pdf_is_stop & ocr_is_stop]
# only the PDF side is a stop word -> keep all OCR letters
df_words_stop_large = df_words_clean.loc[pdf_is_stop]

# cut off lower-counted things
tol_count = 5
df_words_stop = subset_by_percent(df_words_stop.copy(), tol_count = tol_count) # formatting
df_words_stop_large = subset_by_percent(df_words_stop_large.copy(), tol_count = tol_count) # formatting
df_words_stop_large.head()

shape of output= (441, 5)
shape of output= (4262, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,at,at,91.589144,193167.0,91.59%
1,at,,1.289674,2720.0,1.29%
2,at,the,0.016595,35.0,0.02%
3,at,al,3.917859,8263.0,3.92%
4,at,t,0.018492,39.0,0.02%


In [240]:
# debug
from importlib import reload
import utils
reload(utils)

from utils import subset_by_percent, return_matrix_chart_withHist

In [281]:
# Matrix + histogram chart for stop words, built by the helper from utils.py.
# Linear colour scale (log=False) and small tick labels for the many words.
chart_stopwords = return_matrix_chart_withHist(
    df_words_stop,
    df_words_stop_large,
    pdf_title='Ground-truth Stop-words',
    ocr_title='OCR Stop-words',
    height=700,
    width=700,
    hist_width=300,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
    titleFontSize=20,
    labelFontSize=8,
    log=False,
)

In [282]:
# Render the stop-word chart inline.
chart_stopwords

In [252]:
# Export the stop-word chart to a JSON file under myJekyllDir for the website.
chart_stopwords.save(myJekyllDir + 'stopwords.json')

## Most frequent words that are not stopwords

In [253]:
# Tokens to exclude from the "frequent word" analysis: every stop word
# (including the single-character ones), digits, single letters and punctuation.
not_freq_words = [*stop_words_all, *digits, *alphas, *punctuation]

In [268]:
pdf_excluded = df_words_clean['pdf_letters'].isin(not_freq_words)
ocr_excluded = df_words_clean['ocr_letters'].isin(not_freq_words)

# NOTE(review): this drops a row only when BOTH the PDF and the OCR word are
# excluded tokens, so rows with an excluded PDF word but a different OCR word
# are kept (hence df_freq is longer than df_freq_large). Confirm that is the
# intent; df_freq is only used for the length check below.
df_freq = df_words_clean.loc[~(pdf_excluded & ocr_excluded)]
# rows whose PDF word is not an excluded token
df_freq_large = df_words_clean.loc[~pdf_excluded]
len(df_freq), len(df_freq_large)

(896509, 826595)

In [276]:
# how many top words?
ntop = 100
pdf_words = df_freq_large.sort_values('counts unnormalized',ascending=False).iloc[:ntop]['pdf_letters'].values

df_freq_top = df_words_clean.loc[(df_words_clean['pdf_letters'].isin(pdf_words))&(df_words_clean['ocr_letters'].isin(pdf_words))]
df_freq_top_large = df_words_clean.loc[df_words_clean['pdf_letters'].isin(pdf_words)]
print(len(df_freq_top),len(df_freq_top_large))

# cut off lower-counted things
tol_count = 5
df_freq_top = subset_by_percent(df_freq_top.copy(), tol_count = tol_count) # formatting
df_freq_top_large = subset_by_percent(df_freq_top_large.copy(), tol_count = tol_count) # formatting
df_freq_top_large.head()

540 40196
shape of output= (115, 5)
shape of output= (3866, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,high,high,85.028696,34372.0,85.03%
1,high,hieh,5.707006,2307.0,5.71%
2,high,hel,0.178112,72.0,0.18%
3,high,lieh,1.058777,428.0,1.06%
4,high,heh,0.257273,104.0,0.26%


In [308]:
# Matrix + histogram chart for the most frequent non-stop words, built by the
# helper from utils.py (linear colour scale).
chart_freq = return_matrix_chart_withHist(
    df_freq_top,
    df_freq_top_large,
    pdf_title='Ground-truth Most Frequent Words',
    ocr_title='OCR Words',
    height=600,
    width=600,
    hist_width=300,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
    titleFontSize=20,
    labelFontSize=8,
    log=False,
)

In [309]:
# Render the frequent-word chart inline.
chart_freq

In [310]:
# Export the frequent-word chart to a JSON file under myJekyllDir for the website.
chart_freq.save(myJekyllDir + 'most_freq_nonstop.json')

## Most frequently misspelled words

In [287]:
# rows whose PDF word is not a stop word / digit / single letter / punctuation
df_miss_large = df_words_clean.loc[~(df_words_clean['pdf_letters'].isin(not_freq_words))]

# first find which words are == to OCR words, less than some cut off
per_cut = 90.0 # less than

# ones that are misspelled the most: the exact-match row (OCR == PDF) accounts
# for less than per_cut percent of that word's OCR outcomes
exact_match = df_miss_large['pdf_letters'] == df_miss_large['ocr_letters']
below_cut = df_miss_large['counts'] < per_cut
df_word_wrong_words = df_miss_large.loc[exact_match & below_cut, 'pdf_letters'].unique()
# get all of them (every OCR outcome for those words)
df_word_wrong = df_miss_large.loc[df_miss_large['pdf_letters'].isin(df_word_wrong_words)]

In [289]:
# Number of distinct PDF words whose exact-match rate is below per_cut.
df_word_wrong['pdf_letters'].nunique()

49268

In [299]:
ncut = 10000 # how many words in the dataset as a whole?
ntop_freq = 100

# keep only rows with a raw count of at least ncut
df_miss_subset = df_miss_large[df_miss_large['counts unnormalized']>=ncut]
# among the exact-match rows, take the ntop_freq with the LOWEST match
# percentage (ascending sort). NOTE: this overwrites the `pdf_words` from the
# frequent-words section.
correct_rows = df_miss_subset.loc[df_miss_subset['pdf_letters']==df_miss_subset['ocr_letters']]
pdf_words = correct_rows.sort_values('counts').head(ntop_freq)['pdf_letters'].values

In [297]:
# For comparison: total number of distinct PDF words in the cleaned table.
df_words_clean['pdf_letters'].nunique()

195995

In [302]:
pdf_is_missed = df_words_clean['pdf_letters'].isin(pdf_words)
ocr_is_missed = df_words_clean['ocr_letters'].isin(pdf_words)

# both sides among the selected words
df_freq_wrong = df_words_clean.loc[pdf_is_missed & ocr_is_missed]
# only the PDF side among the selected words (all OCR outcomes kept)
df_freq_wrong_large = df_words_clean.loc[pdf_is_missed]

tol_count = 5
df_freq_wrong = subset_by_percent(df_freq_wrong.copy(), tol_count = tol_count) # formatting
df_freq_wrong_large = subset_by_percent(df_freq_wrong_large.copy(), tol_count = tol_count) # formatting

shape of output= (107, 5)
shape of output= (4648, 5)


In [311]:
# Matrix + histogram chart for the frequently misspelled words, built by the
# helper from utils.py (linear colour scale).
chart_freq_wrong = return_matrix_chart_withHist(
    df_freq_wrong,
    df_freq_wrong_large,
    pdf_title='Ground-truth Frequently Misspelled',
    ocr_title='OCR Words',
    height=600,
    width=600,
    hist_width=300,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
    titleFontSize=20,
    labelFontSize=8,
    log=False,
)

In [312]:
# Render the frequently-misspelled chart inline.
chart_freq_wrong

In [313]:
# Export the frequently-misspelled chart to a JSON file under myJekyllDir for the website.
chart_freq_wrong.save(myJekyllDir + 'freqmiss.json')

In [None]:
# TODO (work stopped here): add dropdown selectors to the interactive charts above.