This notebook makes some visualizations, both static and interactive, with [Altair](https://altair-viz.github.io/).

Make sure Altair is installed (use `%pip` rather than `!pip` so the install targets the running kernel's environment):

```python
%pip install altair vega_datasets
```

In [99]:
# Directory where the Altair chart JSON files are written for hosting in a Jekyll site.
# Set the JEKYLL_JSON_DIR environment variable to override the default below
# instead of editing this machine-specific path.
import os

myJekyllDir = os.path.join(
    os.environ.get('JEKYLL_JSON_DIR',
                   '/Users/jnaiman/ReadingTimeMachine.github.io/assets/json/'),
    ''  # joining with '' guarantees a trailing separator; file names are appended directly to this string
)

In [105]:
import altair as alt
import string
import pickle
import pandas as pd
import numpy as np

# debug
from importlib import reload
import utils
reload(utils)

from utils import subset_by_percent, return_matrix_chart_withHist

In [8]:
# Folder (relative to the notebook) holding the pickled inputs; keep the trailing
# slash because file names are appended to this string directly.
data_dir = "./data/"

## Data

In [9]:
# NOTE: pickle.load can execute arbitrary code while deserializing -- only load
# pickle files that you produced yourself or otherwise trust.
def _read_pickle(filename):
    """Return the object pickled in `filename`, looked up inside `data_dir`."""
    with open(data_dir + filename, 'rb') as handle:
        return pickle.load(handle)


letters = _read_pickle('letters.pickle')
words = _read_pickle('words.pickle')

Format characters for plotting:

In [13]:
# matrix
# Flatten the nested counts in `letters` into one row per (PDF character, OCR character) pair.
# NOTE(review): `letters` is iterated as {pdf_char: {ocr_char: count}}; confirm this
# matches the code that wrote letters.pickle.
normalized = True  # True: 'counts' is the percentage within each PDF character
pdf_letters = []
ocr_letters = []
counts = []
counts_un = []
for pl, ols in letters.items():
    if normalized:
        # total number of OCR tokens observed for this PDF character
        cdiv = float(sum(ols.values()))
    else:
        # NOTE(review): with a divisor of 1.0 the 'counts' column becomes raw count * 100;
        # confirm that scaling is intended for the unnormalized case.
        cdiv = 1.0

    for ol, c in ols.items():
        pdf_letters.append(pl)
        ocr_letters.append(ol)
        # Guard against a PDF character whose counts sum to zero, which would
        # otherwise raise ZeroDivisionError.
        counts.append(c / cdiv * 100 if cdiv else 0.0)
        counts_un.append(c)

df_char = pd.DataFrame({'pdf_letters': pdf_letters,
                        'ocr_letters': ocr_letters,
                        'counts': counts,
                        'counts unnormalized': counts_un})
# save
###df_char.to_csv(char_counts_df_file, index=False)

In [14]:
# Preview the first five rows of the flattened character table.
df_char.head(n=5)

Unnamed: 0,pdf_letters,ocr_letters,counts,counts unnormalized
0,r,r,98.29838,9979050
1,r,^,0.753827,76527
2,r,7,0.000493,50
3,r,u,0.060354,6127
4,r,x,0.089462,9082


## Static, just upper-case alphabetic characters (in paper)

In [4]:
# Build the alphabet lists used for filtering:
#   alphas_lower -> 'a'..'z'
#   upperAlphas  -> 'A'..'Z'
#   alphas       -> lower-case letters followed by upper-case letters
alphas_lower = list(string.ascii_lowercase)
upperAlphas = [letter.upper() for letter in alphas_lower]
alphas = alphas_lower + upperAlphas

print(upperAlphas)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [25]:
# Keep only the rows where both the PDF character and the OCR character are
# upper-case letters.
pdf_is_upper = df_char['pdf_letters'].isin(upperAlphas)
ocr_is_upper = df_char['ocr_letters'].isin(upperAlphas)
upper_rows = df_char.loc[pdf_is_upper & ocr_is_upper]

# tol_count = 0: called here mostly for the formatting that subset_by_percent applies
df_upperA = subset_by_percent(upper_rows.copy(), tol_count = 0)
df_upperA.head()

shape of output= (576, 5)


Unnamed: 0,pdf_letters,ocr_letters,% of all OCR tokens,Total Count of PDF token,name
0,A,A,96.392449,311925,96.39%
1,A,X,0.58282,1886,0.58%
2,A,C,0.017614,57,0.02%
3,A,I,0.014215,46,0.01%
4,A,L,0.040482,131,0.04%


In [26]:
# Parameters for the static matrix + histogram figure.
alphas_hist = ['A', 'K']  # letters to histogram

# Columns of the formatted frame (as shown in the df_upperA.head() output).
# percent_column was previously set to 'counts' and then overwritten; only the
# effective value is kept.
percent_column = "% of all OCR tokens"
count_column = "Total Count of PDF token"

# Colour scale
min_percent = 0.01  # only plot things that are at this level (in %) or above in terms of frequency in dataset
scheme = 'viridis'
log = True  # log of colormap?
color_title = 'Percent in %'

# Sizes (pixels)
height = 400  # matrix
width = 400  # matrix
hist_width = 400

# Axis titles
pdf_title = 'PDF Characters'
ocr_title = 'OCR Characters'

In [30]:
# Static figure: confusion matrix of PDF vs. OCR characters on the left,
# one bar chart per letter in `alphas_hist` stacked on the right.

# Each histogram gets an equal share of the matrix height, with one share left over (the +1).
hist_height = height // (len(alphas_hist) + 1)

# Axis order: PDF characters sorted; the OCR axis uses the same order followed by
# any characters that only appear on the OCR side.
sort_pdf = sorted(np.unique(df_upperA['pdf_letters']).tolist())
extra_ocr = sorted(o for o in df_upperA['ocr_letters'].unique() if o not in sort_pdf)
sort_ocr = sort_pdf + extra_ocr

# legend placement: horizontal gradient centred above the matrix
length = 200
legendY = -50
legendX = width // 2 - length // 2

if log:
    color = alt.Color(percent_column + ":Q",
                      scale=alt.Scale(type='log', scheme=scheme, domain=[min_percent, 100]),
                      title=color_title,
                      legend=alt.Legend(
                          orient='none',
                          legendX=legendX, legendY=legendY,
                          direction='horizontal',
                          titleAnchor='middle', gradientLength=length))
else:
    # Fixed: the field was previously built as percent_column + "value:Q",
    # which names a column that does not exist in the data.
    color = alt.Color(percent_column + ":Q",
                      scale=alt.Scale(scheme=scheme, domain=[min_percent, 100]),
                      title=color_title)

chart1 = alt.Chart(df_upperA).mark_rect().encode(
    alt.Y("pdf_letters:O", sort=sort_pdf, title=pdf_title),
    alt.X("ocr_letters:O", sort=sort_ocr, title=ocr_title),
    color=color
).properties(
    height=height,
    width=width
)

chart_hists = []
for a in alphas_hist:
    dfin_larger = df_upperA.loc[df_upperA['pdf_letters'] == a]
    chart2 = alt.Chart(dfin_larger, title="GT Letter = " + a).mark_bar().transform_filter(
        # range is [minimum, maximum]: keep bars at or above min_percent
        alt.FieldRangePredicate(field=percent_column, range=[min_percent, 100])
    ).encode(
        alt.X('ocr_letters:O', sort='-y', title=ocr_title),
        # use percent_column so the bars always show the same column the filter uses
        alt.Y(percent_column + ":Q",
              scale=alt.Scale(type='log', domain=[min_percent, 100]),
              title='% Correct')
    ).properties(
        width=hist_width,
        height=hist_height
    )
    chart_hists.append(chart2)

chart_hists_all = alt.vconcat(*chart_hists, center=True)

chart = alt.hconcat(chart1, chart_hists_all, center=True)

# configure_axis returns a new chart; leaving it as the last expression displays it
chart.configure_axis(
    labelFontSize=20,
    titleFontSize=20
)

## Full alphas, with interactivity

In [102]:
# # function for plots -- with histograms (reference copy; the version used below is imported from utils)
# def return_matrix_chart_withHist(dfin,  dfin_larger, textsize=20, stroke='black', 
#                         height=800, width=900, scheme='viridis', 
#                        log=True, color_title = 'Percent in %',
#                        pdf_tag = 'PDF', ocr_tag = 'OCR',
#                        return_sort_ocr=False,
#                        percent_column = "% of all OCR tokens",
#                        count_column = "Total Count of PDF token",
#                        pdf_title='PDF Characters', ocr_title='OCR Characters',
#                                 hist_width=800, min_percent = 1.0, hist_labelFontSize=16, 
#                                 hist_location = 'right', 
#                             legend_length = 200, legend_Y = -50,legend_direction='horizontal',
#                                 color_selection = False):
    
#     # for colormap legend
#     # legend placement
#     length = legend_length
#     legendY = legend_Y
#     legendX = 0 + width//2 - length//2

    
#     sort_pdf = np.unique(dfin['pdf_letters']).tolist()
#     sort_pdf.sort()
#     extra_ocr = []
#     for o in dfin['ocr_letters'].unique():
#         if o not in sort_pdf:
#             extra_ocr.append(o)

#     extra_ocr.sort()

#     sort_ocr = sort_pdf.copy()
#     sort_ocr.extend(extra_ocr)
    
#     # check special characters:
#     for i in range(len(sort_pdf)):
#         if '\\' in repr(sort_pdf[i]): # have escaped
#             sort_pdf[i] = re.escape(repr(sort_pdf[i]))
        
#     for i in range(len(sort_ocr)):
#         if '\\' in repr(sort_ocr[i]): # have escaped
#             sort_ocr[i] = re.escape(repr(sort_ocr[i]))
            
#     # also clean dataframe
#     for i in range(len(dfin)):
#         for c in ['pdf_letters','ocr_letters']:
#             esc = repr(dfin.iloc[i][c])
#             if '\\' in esc: # have escaped
#                 dfin.at[i,c]=re.escape(esc)
                    
#     # maybe some special words?
#     for iss,s in enumerate(sort_ocr):
#         if 'if' == s:
#             #sort_ocr[iss] = "''if"
#             sort_ocr[iss] = '"if"' #str('if')
#             #sort_ocr[iss] = u'if'
#     # maybe some special words?
#     for iss,s in enumerate(sort_pdf):
#         if 'if' == s:
#             #sort_pdf[iss] = "''if"
#             sort_pdf[iss] = '"if"' #str('if')
#             #sort_pdf[iss] = u'if'


#     if color_selection:
#         column_select = alt.selection_point(fields=['column'],
#                                      bind=alt.binding_select(options=[percent_column, 
#                                                                       count_column], 
#                                                              name='Color by: '),
#                                      value=percent_column)
#         color_col = 'value'
#     else:
#         color_col = percent_column

#     selector = alt.selection_point(encodings=['y'])#, init={pdf_letters:'A'})
#     opacity = alt.condition(selector,alt.value(1),alt.value(0.25))


#     if log:
#         if not color_selection:
#             color = alt.Color(color_col+":Q", scale=alt.Scale(type='log',scheme=scheme,domain=[min_percent,100]),title=color_title,
#                               legend=alt.Legend(
#                             orient='none',
#                             legendX=legendX, legendY=legendY,
#                             direction=legend_direction,
#                             titleAnchor='middle', gradientLength=length))
#         else:
#             color = alt.Color(color_col+":Q", scale=alt.Scale(type='log',scheme=scheme),title=color_title,
#                               legend=alt.Legend(
#                             orient='none',
#                             legendX=legendX, legendY=legendY,
#                             direction=legend_direction,
#                             titleAnchor='middle', gradientLength=length))
#     else:
#         if not color_selection:
#             color = alt.Color(color_col+":Q", scale=alt.Scale(scheme=scheme,domain=[min_percent,100]),title=color_title,
#                               legend=alt.Legend(
#                             orient='none',
#                             legendX=legendX, legendY=legendY,
#                             direction=legend_direction,
#                             titleAnchor='middle', gradientLength=length))
#         else:
#             color = alt.Color(color_col+":Q", scale=alt.Scale(scheme=scheme),title=color_title,
#                               legend=alt.Legend(
#                             orient='none',
#                             legendX=legendX, legendY=legendY,
#                             direction=legend_direction,
#                             titleAnchor='middle', gradientLength=length))
        
#     if not color_selection:
#         chart1 = alt.Chart(dfin).mark_rect().transform_fold(
#             fold=[percent_column, count_column],
#             as_=['column', 'value']
#         ).encode(
#             alt.Y("pdf_letters:O",sort=sort_pdf,title=pdf_title),
#             alt.X("ocr_letters:O",sort=sort_ocr,title=ocr_title),
#             color=color,
#             opacity=opacity,
#             tooltip=[alt.Tooltip("pdf_letters:O",title=pdf_tag), 
#                      alt.Tooltip("ocr_letters:O",title=ocr_tag), 
#                      alt.Tooltip("name:N",title='Percentage'),
#                     alt.Tooltip(count_column+':Q',title='Count')]
#         ).properties(
#             height=height,
#             width=width
#         ).add_params(
#             selector
#         )

#     else:
#         chart1 = alt.Chart(dfin).mark_rect().transform_fold(
#             fold=[percent_column, count_column],
#             as_=['column', 'value']
#         ).transform_filter(
#             column_select
#         ).encode(
#             alt.Y("pdf_letters:O",sort=sort_pdf,title=pdf_title),
#             alt.X("ocr_letters:O",sort=sort_ocr,title=ocr_title),
#             color=color,
#             opacity=opacity,
#             tooltip=[alt.Tooltip("pdf_letters:O",title=pdf_tag), 
#                      alt.Tooltip("ocr_letters:O",title=ocr_tag), 
#                      alt.Tooltip("name:N",title='Percentage'),
#                     alt.Tooltip(count_column+':Q',title='Count')]
#         ).properties(
#             height=height,
#             width=width
#         ).add_params(
#             selector,
#             column_select
#         )
        
#     chart2 = alt.Chart(dfin_larger).mark_bar().transform_filter(
#         selector
#     ).transform_filter(
#        alt.FieldRangePredicate(field=percent_column, range=[100, min_percent])
#        #alt.FieldRangePredicate(field=percent_column, range=[100, slider])
#     ).encode(
#         alt.X('ocr_letters:O', sort='-y',title=ocr_title),#,labelFontSize=hist_labelFontSize),
#         alt.Y("% of all OCR tokens:Q"),#, 
#             tooltip=[alt.Tooltip("pdf_letters:O",title=pdf_tag), 
#                  alt.Tooltip("ocr_letters:O",title=ocr_tag), 
#                  alt.Tooltip("name:N",title='Percentage'),
#                 alt.Tooltip(count_column+':Q',title='Count')]

#     ).properties(
#         width=hist_width
#     )

#     if hist_location == 'bottom':
#         chart = alt.vconcat(chart1, chart2, center=True)
#     elif hist_location == 'right':
#         chart = alt.hconcat(chart1,chart2,center=True)
#     else:
#         print('not supported location for hist, will place on right')
#         chart = alt.hconcat(chart1,chart2,center=True)

#     if return_sort_ocr:
#         return chart, sort_ocr
#     return chart,chart1

In [106]:
# Keep only the rows where both the PDF character and the OCR character are
# letters (lower- or upper-case).
pdf_is_alpha = df_char['pdf_letters'].isin(alphas)
ocr_is_alpha = df_char['ocr_letters'].isin(alphas)
alpha_rows = df_char.loc[pdf_is_alpha & ocr_is_alpha]

# tol_count = 0: called here mostly for the formatting that subset_by_percent applies
df_alphas = subset_by_percent(alpha_rows.copy(), tol_count = 0)

shape of output= (2510, 5)


In [108]:
# Interactive matrix + histogram for all alphabetic characters.
# The axis titles previously read 'PDF Words' / 'OCR Words', but df_alphas holds
# single characters, so they are labelled as characters here.
# NOTE(review): the commented-out copy of return_matrix_chart_withHist kept in this
# notebook ends with `return chart,chart1`; confirm that the utils version returns a
# single chart, since this result is later displayed and saved with .save().
chart_alphas = return_matrix_chart_withHist(df_alphas,
                                            df_alphas,
                                            pdf_title='PDF Characters',
                                            ocr_title='OCR Characters',
                                            height=500, width=500,
                                            hist_width=400,
                                            hist_labelFontSize=16,
                                            min_percent=0.01)

In [109]:
# Display the interactive chart (the bare last expression of the cell is rendered).
chart_alphas

In [110]:
# Write the chart specification as JSON into the Jekyll assets directory
# (myJekyllDir is expected to end with a path separator).
alphas_json_path = f"{myJekyllDir}alphas.json"
chart_alphas.save(alphas_json_path)