In [97]:
%matplotlib inline

import json, os, sys, re, math
import numpy as np
import pandas as pd
import cv2  
from shutil import copy
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [98]:
path = '../data/CNNTest/'
train_bbs = json.load(open(path+'train/bounding_box_data.json','r'))

In [99]:
papers = {}
papers_df = pd.DataFrame()
i = 0
for paper_page in train_bbs:
    paper = paper_page.split('-')[0]
    page = f"page{paper_page.replace('.png','').split('-')[-1]}"
    if paper in papers:
        papers[paper]['pages_count'] +=1
        
    else:
        papers[paper]={}
        papers[paper]['pages_count']=0
        papers[paper]['pages'] = {}
        
    temp_papers = []
    for region in train_bbs[paper_page]['regions']:
        temp = train_bbs[paper_page]['regions'][region]
        width = temp['shape_attributes']['all_points_x'][2]-temp['shape_attributes']['all_points_x'][0]
        left = temp['shape_attributes']['all_points_x'][0]
        top = temp['shape_attributes']['all_points_y'][0]
        entity_id = temp['shape_attributes']['entity_id']
        height = temp['shape_attributes']['all_points_y'][1]-temp['shape_attributes']['all_points_y'][0]
        papers_df = papers_df.append({'paper':paper,'page':page,'width':width,'height':height, \
                                      'entity_id':entity_id, 'top':top,'left':left}, ignore_index=True)
        temp_papers.append((width, height))
    papers[paper]['pages'][page] = temp_papers

num_papers = 0
num_pages = 0
num_eqns = 0

widths = []
heights = []

eqns_per_page = []
eqns_per_paper = []

pages_per_paper = []

for paper in papers:
    num_papers+=1
    num_pages+= papers[paper]['pages_count']
    pages_per_paper.append(papers[paper]['pages_count'])
    eqns_for_this_paper = 0
    for page in papers[paper]['pages']:
        num_eqns+=len(papers[paper]['pages'][page])
        eqns_per_page.append(len(papers[paper]['pages'][page]))
        eqns_for_this_paper +=len(papers[paper]['pages'][page])
        for eqn in papers[paper]['pages'][page]:
            widths.append(eqn[0])
            heights.append(eqn[1])
        
    eqns_per_paper.append(eqns_for_this_paper)
    

In [100]:
num_papers, num_pages, num_eqns

(323, 5234, 13722)

In [82]:


fig = go.Figure()
# fig.add_trace(go.Histogram(x=widths,bingroup=1,name="Widths"))
# fig.add_trace(go.Histogram(x=heights,bingroup=1,name="Heights"))
fig.add_trace(go.Histogram(x=papers_df["width"],bingroup=1,name="Widths", histnorm="percent"))
fig.add_trace(go.Histogram(x=papers_df["height"],bingroup=1,name="Heights", histnorm="percent"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Histograms of Equation Heights and Widths', # title of plot
    xaxis_title_text='Pixels', # xaxis label
    yaxis_title_text='% Eqns', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()



In [83]:


fig = go.Figure()
# fig.add_trace(go.Histogram(x=widths,bingroup=1,name="Widths"))
# fig.add_trace(go.Histogram(x=heights,bingroup=1,name="Heights"))
fig.add_trace(go.Histogram(x=papers_df["width"],bingroup=1,name="Widths", histnorm="percent",cumulative_enabled=True))
fig.add_trace(go.Histogram(x=papers_df["height"],bingroup=1,name="Heights", histnorm="percent",cumulative_enabled=True))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Histograms of Equation Heights and Widths', # title of plot
    xaxis_title_text='Pixels', # xaxis label
    yaxis_title_text='% Eqns', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()


papers_df['area'] = papers_df['height'] * papers_df['width']

fig = go.Figure()
# fig.add_trace(go.Histogram(x=widths,bingroup=1,name="Widths"))
# fig.add_trace(go.Histogram(x=heights,bingroup=1,name="Heights"))
fig.add_trace(go.Histogram(x=papers_df["area"],bingroup=1,name="Area", histnorm="percent"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Histograms of Equation Area', # title of plot
    xaxis_title_text='Pixels^2', # xaxis label
    yaxis_title_text='% Eqns', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()



In [84]:


fig = go.Figure()
fig.add_trace(go.Histogram(x=eqns_per_page,bingroup=1,name="Eqns_per_page", histnorm="percent",cumulative_enabled=True))
# fig.add_trace(go.Histogram(x=eqns_per_page,bingroup=1,name="Eqns_per_page", histnorm="percent"))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Histograms of Equations per Page', # title of plot
    xaxis_title_text='Eqns', # xaxis label
    yaxis_title_text='% Pages', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()



In [85]:
paper_df = papers_df.sort_values(['paper','page','entity_id','top'], ascending=True)
papers_df.head()

Unnamed: 0,entity_id,height,left,page,paper,top,width,area
0,10.0,11.0,361.0,page1,1503.00098,523.0,156.0,1716.0
1,12.0,11.0,388.0,page1,1503.00098,588.0,101.0,1111.0
2,15.0,24.0,368.0,page1,1503.00098,653.0,141.0,3384.0
3,16.0,22.0,383.0,page1,1503.00098,704.0,113.0,2486.0
4,18.0,22.0,156.0,page2,1503.00098,89.0,39.0,858.0


In [86]:
papers_df[papers_df.height<7].sort_values(['height','width'],ascending=False)
#Height <7 is the magic number to merge 

Unnamed: 0,entity_id,height,left,page,paper,top,width,area


In [87]:
papers_df[papers_df.height>=0].sort_values(['height'],ascending=False).head(5)


Unnamed: 0,entity_id,height,left,page,paper,top,width,area
5095,193.0,187.0,62.0,page7,2001.0001,456.0,482.0,90134.0
5125,205.0,167.0,68.0,page12,2001.0001,128.0,505.0,84335.0
7465,1038.0,166.0,85.0,page51,705.00115,263.0,486.0,80676.0
714,659.0,160.0,229.0,page23,1503.00036,427.0,292.0,46720.0
515,788.0,159.0,101.0,page22,705.00042,367.0,409.0,65031.0


In [88]:
# print(sorted(papers_df.width.unique()))
papers_df[(papers_df.width<=10) & (papers_df.height>=7)].sort_values(['paper','page'],ascending=False).head(20)

Unnamed: 0,entity_id,height,left,page,paper,top,width,area


In [89]:
# print(sorted(papers_df.width.unique()))
papers_df[(papers_df.width<=10)].sort_values(['width'],ascending=False)

Unnamed: 0,entity_id,height,left,page,paper,top,width,area


In [90]:
total_eqns = papers_df.groupby(['paper','page']).size()



total_papers = list(set(list(papers_df.paper)))

papers_to_remove = []
papers_to_remove = papers_to_remove + ["1501.00009","1503.00066","0705.00116"] #page orientation
papers_to_remove = papers_to_remove + ["0705.00017"] #Random figures and artifacts as eqns
papers_to_remove = papers_to_remove +  list(set(list(papers_df[(papers_df.width<=10)].paper)))
papers_to_remove = papers_to_remove +  list(set(list(papers_df[(papers_df.height<7)].paper)))


def remove_papers(row):

    if row in papers_to_remove:
        return 1
    else:
        return 0

In [91]:
len(total_papers), len(papers_to_remove)

(215, 4)

In [92]:
total_eqns = papers_df.groupby(['paper','page']).size().reset_index().sort_values(['paper','page'])
total_eqns.columns = ['paper','page','eqns']
new_papers_df = papers_df.merge(total_eqns, on=['paper','page'], how="left")
new_papers_df.head()

Unnamed: 0,entity_id,height,left,page,paper,top,width,area,eqns
0,10.0,11.0,361.0,page1,1503.00098,523.0,156.0,1716.0,4
1,12.0,11.0,388.0,page1,1503.00098,588.0,101.0,1111.0,4
2,15.0,24.0,368.0,page1,1503.00098,653.0,141.0,3384.0,4
3,16.0,22.0,383.0,page1,1503.00098,704.0,113.0,2486.0,4
4,18.0,22.0,156.0,page2,1503.00098,89.0,39.0,858.0,13


In [93]:
new_papers_df[new_papers_df['eqns']<=20]

Unnamed: 0,entity_id,height,left,page,paper,top,width,area,eqns
0,10.0,11.0,361.0,page1,1503.00098,523.0,156.0,1716.0,4
1,12.0,11.0,388.0,page1,1503.00098,588.0,101.0,1111.0,4
2,15.0,24.0,368.0,page1,1503.00098,653.0,141.0,3384.0,4
3,16.0,22.0,383.0,page1,1503.00098,704.0,113.0,2486.0,4
4,18.0,22.0,156.0,page2,1503.00098,89.0,39.0,858.0,13
...,...,...,...,...,...,...,...,...,...
7507,0.0,35.0,208.0,page22,0705.00051,181.0,193.0,6755.0,2
7508,0.0,35.0,170.0,page22,0705.00051,502.0,270.0,9450.0,2
7509,262.0,48.0,305.0,page9,1501.00043,488.0,128.0,6144.0,1
7510,321.0,19.0,94.0,page10,1501.00043,733.0,133.0,2527.0,1


In [94]:
new_papers_df['remove'] = new_papers_df.paper.apply(lambda x: remove_papers(x))

In [95]:
new_papers_df[(new_papers_df['eqns']<=20) & (new_papers_df['remove']==0)].drop_duplicates()

Unnamed: 0,entity_id,height,left,page,paper,top,width,area,eqns,remove
0,10.0,11.0,361.0,page1,1503.00098,523.0,156.0,1716.0,4,0
1,12.0,11.0,388.0,page1,1503.00098,588.0,101.0,1111.0,4,0
2,15.0,24.0,368.0,page1,1503.00098,653.0,141.0,3384.0,4,0
3,16.0,22.0,383.0,page1,1503.00098,704.0,113.0,2486.0,4,0
4,18.0,22.0,156.0,page2,1503.00098,89.0,39.0,858.0,13,0
...,...,...,...,...,...,...,...,...,...,...
7507,0.0,35.0,208.0,page22,0705.00051,181.0,193.0,6755.0,2,0
7508,0.0,35.0,170.0,page22,0705.00051,502.0,270.0,9450.0,2,0
7509,262.0,48.0,305.0,page9,1501.00043,488.0,128.0,6144.0,1,0
7510,321.0,19.0,94.0,page10,1501.00043,733.0,133.0,2527.0,1,0
