This notebook makes some visualizations, both static and interactive, with [Altair](https://altair-viz.github.io/).

Make sure Altair is installed:

```python
!pip install altair vega_datasets
```

In [93]:
# Output directory for the Altair chart specs (JSON) that are hosted from a Jekyll site.
# NOTE: this is an absolute, machine-specific path -- change it before running elsewhere.
# Keep the trailing '/': the save cells build file names by plain string concatenation
# (e.g. myJekyllDir + 'alphas.json').
myJekyllDir = '/Users/jnaiman/ReadingTimeMachine.github.io/assets/json/'

In [94]:
import altair as alt
import string
import pickle
import pandas as pd
import numpy as np

# debug
from importlib import reload
import utils
reload(utils)

from utils import subset_by_percent, return_matrix_chart_withHist

In [95]:
# Directory holding the input pickles. Keep the trailing '/': file paths are built by
# string concatenation (data_dir + 'letters.pickle').
data_dir = './data/'

## Data

In [96]:
# Load the pre-computed count tables.
# `letters` is consumed below as a nested mapping {pdf_char: {ocr_char: count}}.
# NOTE(review): the structure of `words` is not exercised in this part of the notebook -- confirm before use.
# SECURITY: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(data_dir+'letters.pickle','rb') as f:
    letters = pickle.load(f)
with open(data_dir+'words.pickle','rb') as f:
    words = pickle.load(f)

Format characters for plotting:

In [97]:
# Flatten the nested `letters` mapping ({pdf_char: {ocr_char: count}}) into a long-format
# table with one row per (PDF character, OCR character) pair.
normalized = True  # True: 'counts' is a percentage of each PDF character's total;
                   # False: 'counts' is the raw count multiplied by 100 (denominator fixed at 1.0)
pdf_letters = []; ocr_letters = []; counts = []
counts_un = []
for pl, ols in letters.items():
    # Denominator: total count of this PDF character over all of its OCR outcomes.
    cdiv = sum(ols.values(), 0.0) if normalized else 1.0

    for ol, c in ols.items():
        pdf_letters.append(pl)
        ocr_letters.append(ol)
        # Guard against an all-zero row, which would otherwise divide by zero.
        counts.append(c / cdiv * 100 if cdiv else 0.0)
        counts_un.append(c)

df_char = pd.DataFrame({'pdf_letters': pdf_letters,
                        'ocr_letters': ocr_letters,
                        'counts': counts,
                        'counts unnormalized': counts_un})
# To persist the table, call df_char.to_csv(<path>, index=False). The previously disabled
# line used `char_counts_df_file`, which is not defined in this part of the notebook.

In [98]:
# Preview the first rows of the long-format character table.
df_char.head()

Unnamed: 0,pdf_letters,ocr_letters,counts,counts unnormalized
0,r,r,98.29838,9979050
1,r,^,0.753827,76527
2,r,7,0.000493,50
3,r,u,0.060354,6127
4,r,x,0.089462,9082


# Characters

## Static, just upper-case alphabetic characters (in paper)

In [99]:
# Character lists used to filter the tables below:
#   alphas_lower -> 'a'..'z'
#   alphas       -> 'a'..'z' followed by 'A'..'Z'
#   upperAlphas  -> 'A'..'Z' (the slice of `alphas` after the lower-case letters)
alphas_lower = list(string.ascii_lowercase)
alphas = alphas_lower + [ch.upper() for ch in alphas_lower]

upperAlphas = alphas[len(alphas_lower):]
print(upperAlphas)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [100]:
# Keep only the rows where both the PDF character and the OCR character are upper-case letters.
pdf_is_upper = df_char['pdf_letters'].isin(upperAlphas)
ocr_is_upper = df_char['ocr_letters'].isin(upperAlphas)

# subset_by_percent is applied mostly for formatting; judging by the recorded output it renames
# the count columns and adds a 'name' label column. NOTE(review): confirm details in utils.py.
df_upperA = subset_by_percent(df_char.loc[pdf_is_upper & ocr_is_upper].copy(), tol_count=0)
df_upperA.head()

shape of output= (576, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,A,A,96.392449,311925.0,96.39%
1,A,X,0.58282,1886.0,0.58%
2,A,C,0.017614,57.0,0.02%
3,A,I,0.014215,46.0,0.01%
4,A,L,0.040482,131.0,0.04%


In [101]:
# Parameters for the static upper-case figure.
alphas_hist = ['A', 'K'] # letters to histogram

# other params
min_percent = 0.01 # only plot things that are at this level (in %) or above in terms of frequency in dataset
scheme = 'viridis' # colour scheme of the matrix
height = 400 # matrix
width = 400 # matrix
log = True # log of colormap?
color_title = 'Percent in %'
# Column names as they appear in the frame returned by subset_by_percent.
percent_column = "% of all OCR tokens"
count_column = "Total Count of PDF token"
pdf_title = 'PDF Characters'
ocr_title = 'OCR Characters'

hist_width = 400 # width of each per-letter histogram

In [102]:
# Static figure: matrix of PDF vs. OCR upper-case characters (left) next to one bar chart per
# letter listed in `alphas_hist` (right).

# Each histogram gets an equal share of the matrix height, with one share left over
# (presumably to leave room for titles/axes).
hist_height = height // (len(alphas_hist)+1)

# Axis ordering: PDF characters alphabetically; the OCR axis reuses that order and then
# appends any characters that occur only on the OCR side.
sort_pdf = sorted(df_upperA['pdf_letters'].unique())
extra_ocr = sorted(o for o in df_upperA['ocr_letters'].unique() if o not in sort_pdf)
sort_ocr = sort_pdf + extra_ocr

# legend placement: horizontal gradient centred over the matrix
length = 200
legendY = -50
legendX = width//2 - length//2

# Colour encoding. Both branches encode `percent_column`; the linear branch used to append
# "value" to the column name, which pointed at a field that does not exist in the data.
color_field = percent_column + ":Q"
if log:
    color = alt.Color(color_field,
                      scale=alt.Scale(type='log', scheme=scheme, domain=[min_percent, 100]),
                      title=color_title,
                      legend=alt.Legend(
                          orient='none',
                          legendX=legendX, legendY=legendY,
                          direction='horizontal',
                          titleAnchor='middle', gradientLength=length))
else:
    color = alt.Color(color_field,
                      scale=alt.Scale(scheme=scheme, domain=[min_percent, 100]),
                      title=color_title)

chart1 = alt.Chart(df_upperA).mark_rect().encode(
    alt.Y("pdf_letters:O", sort=sort_pdf, title=pdf_title),
    alt.X("ocr_letters:O", sort=sort_ocr, title=ocr_title),
    color=color
).properties(
    height=height,
    width=width
)

chart_hists = []
for a in alphas_hist:
    # rows whose PDF character is the letter being histogrammed
    dfin_larger = df_upperA.loc[df_upperA['pdf_letters'] == a]
    chart2 = alt.Chart(dfin_larger, title="GT Letter = " + a).mark_bar().transform_filter(
        # range is [min, max]; keep only bars at or above the plotting threshold
        alt.FieldRangePredicate(field=percent_column, range=[min_percent, 100])
    ).encode(
        alt.X('ocr_letters:O', sort='-y', title=ocr_title),
        # NOTE(review): the axis title says '% Correct', but bars are drawn for every OCR
        # character of this letter, not only the matching one -- confirm the intended label.
        alt.Y(percent_column + ":Q", scale=alt.Scale(type='log', domain=[min_percent, 100]), title='% Correct')
    ).properties(
        width=hist_width,
        height=hist_height
    )
    chart_hists.append(chart2)

chart_hists_all = alt.vconcat(*chart_hists, center=True)

chart = alt.hconcat(chart1, chart_hists_all, center=True)

# Last expression of the cell, so the configured chart is what the notebook displays;
# the result is not assigned back to `chart`.
chart.configure_axis(
    labelFontSize=20,
    titleFontSize=20
)

In [103]:
#df_upperA.loc[df_upperA['pdf_letters']=='K']

## Full alphabet (lower- and upper-case letters), with interactivity

In [104]:
# Character sets for the full-alphabet matrix: every letter in `alphas` plus one extra
# symbol per axis ('@' on the PDF side, '^' on the OCR side).
# NOTE(review): the meaning of '@' and '^' is defined by the upstream data -- confirm there.
alphas_extend_pdf = alphas + ['@']
alphas_extend_ocr = alphas + ['^']

In [105]:
# Row masks over the long-format table.
pdf_in_alphas = df_char['pdf_letters'].isin(alphas_extend_pdf)
ocr_in_alphas = df_char['ocr_letters'].isin(alphas_extend_ocr)

# df_alphas: both characters restricted to the extended alphabet.
# df_alphas_large: only the PDF character is restricted, so all OCR letters are kept.
# subset_by_percent is applied mostly for formatting.
# NOTE(review): tol_count is presumably a minimum-count threshold -- confirm in utils.py.
df_alphas = subset_by_percent(df_char.loc[pdf_in_alphas & ocr_in_alphas].copy(), tol_count=0)
df_alphas_large = subset_by_percent(df_char.loc[pdf_in_alphas].copy(), tol_count=100)

shape of output= (2615, 5)
shape of output= (2135, 5)


In [106]:
#df_alphas_large['pdf_letters'].unique()

In [107]:
# Build the interactive full-alphabet chart from the two tables prepared above.
# NOTE(review): how each argument is used is defined in utils.return_matrix_chart_withHist -- confirm there.
chart_alphas = return_matrix_chart_withHist(
    df_alphas,
    df_alphas_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=500,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.01,
)

In [108]:
# Render the interactive full-alphabet chart inline.
chart_alphas

In [109]:
# Save the chart into the Jekyll assets directory; the path is built by string
# concatenation, so myJekyllDir must end with '/'.
chart_alphas.save(myJekyllDir + 'alphas.json')

## Digits

In [110]:
# Digit characters '0'..'9' as strings, plus one extra symbol per axis
# ('@' on the PDF side, '^' on the OCR side).
# NOTE(review): the meaning of '@' and '^' is defined by the upstream data -- confirm there.
digits = [str(d) for d in range(10)]
digits_extend_pdf = digits + ['@']
digits_extend_ocr = digits + ['^']

In [111]:
# Row masks over the long-format table.
pdf_in_digits = df_char['pdf_letters'].isin(digits_extend_pdf)
ocr_in_digits = df_char['ocr_letters'].isin(digits_extend_ocr)

# df_digits: both characters restricted to the extended digit set.
# df_digits_large: only the PDF character is restricted, so all OCR letters are kept.
# subset_by_percent is applied mostly for formatting.
# NOTE(review): tol_count is presumably a minimum-count threshold -- confirm in utils.py.
df_digits = subset_by_percent(df_char.loc[pdf_in_digits & ocr_in_digits].copy(), tol_count=0)
df_digits_large = subset_by_percent(df_char.loc[pdf_in_digits].copy(), tol_count=10)
df_digits.head()

shape of output= (121, 5)
shape of output= (1005, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,3,3,93.725501,170721.0,93.73%
1,3,INSERT,2.481471,4520.0,2.48%
2,3,2,0.658249,1199.0,0.66%
3,3,9,0.181718,331.0,0.18%
4,3,8,0.413945,754.0,0.41%


In [115]:
# Build the interactive digits chart from the two tables prepared above.
# NOTE(review): how each argument is used is defined in utils.return_matrix_chart_withHist -- confirm there.
chart_digits = return_matrix_chart_withHist(
    df_digits,
    df_digits_large,
    pdf_title='Ground-truth Characters',
    ocr_title='OCR Characters',
    height=500,
    width=500,
    hist_width=400,
    hist_labelFontSize=16,
    min_percent=0.1,
    scheme='viridis',
)

In [116]:
# Render the interactive digits chart inline.
chart_digits

In [117]:
# Save the chart into the Jekyll assets directory; the path is built by string
# concatenation, so myJekyllDir must end with '/'.
chart_digits.save(myJekyllDir + 'digits.json')