In [7]:
import os
import numpy as np
import pandas as pd
import plotly.express as px

In [87]:
# Load the aggregated per-code-point character counts.
# Fast path: a previous run already merged every shard into unicode_counts.npy.
# Slow path: sum the per-shard arrays in charcounts/ and cache the result.
if os.path.exists("unicode_counts.npy"):
    unicode_values = np.load("unicode_counts.npy")
else:
    # Fail early with an actionable message instead of a bare OS error from listdir.
    if not os.path.isdir("charcounts/"):
        raise FileNotFoundError(
            "Neither unicode_counts.npy nor the charcounts/ folder exists; "
            "run count_unicode_pile.py first to generate the character counts."
        )

    # only the .npy shards are aggregated; anything else in the folder is ignored
    files = [x for x in os.listdir("charcounts/") if x.endswith(".npy")]
    if not files:
        # Without this guard an all-zero array would be cached and then loaded
        # by the fast path on every later run.
        raise FileNotFoundError(
            "charcounts/ contains no .npy files; "
            "run count_unicode_pile.py first to generate the character counts."
        )

    # one int64 slot per Unicode code point (1114112 == 0x110000)
    unicode_values = np.zeros((1114112,), dtype=np.int64)
    for xfile in files:
        # each shard is added element-wise, so it must have the same length
        unicode_values += np.load("charcounts/"+xfile)
    # the shard files are left on disk; only the merged array is written
    np.save("unicode_counts.npy", unicode_values)

In [88]:
# Build the ranked per-character table used for processing/plotting.
# 1114112 == 0x110000, the number of Unicode code points; unicode_values is
# expected to hold exactly one count per code point.
df = (
    pd.DataFrame({"unicode": np.arange(0, 1114112), "count": unicode_values})
    # printable label for every code point (repr() of the one-character string)
    .assign(unicode_char=lambda frame: frame["unicode"].map(lambda cp: repr(chr(cp))))
    # most frequent characters first
    .sort_values(by="count", ascending=False)
    # the numeric code point column is not kept once the label exists
    .drop(columns=["unicode"])
)

# Share of all counted characters, in percent. The arithmetic is vectorized
# (one Series operation) rather than a per-row Python lambda over ~1.1M rows.
total_pile_chars = df["count"].sum()
# Rounding/formatting stays element-wise on the float Series so the resulting
# strings (e.g. "12.345678%") are produced by Python's own round() and str().
df["perc_pile"] = (df["count"] / total_pile_chars * 100).map(lambda x: str(round(x, 6))+"%")

In [93]:
# interactive horizontal bar chart for one window of the ranked character table
def display_bar_chart(df, N=512, Nskip=0):
    """Display rows ``Nskip`` to ``Nskip + N`` of ``df`` as a horizontal bar chart.

    Parameters
    ----------
    df : table with "perc_pile", "unicode_char" and "count" columns. Rows are
        taken in their existing order (assumed to be ranked already by the caller).
    N : number of rows in the plotted window; also determines the figure height.
    Nskip : number of leading rows skipped before the window starts.

    Returns
    -------
    None. The figure is rendered through ``fig.show()``.
    """
    first, last = Nskip, N + Nskip
    # take the requested window, then reverse its row order before plotting
    window = df.iloc[first:last].iloc[::-1]

    # y axis: the character label; x axis: its percentage; bar text: the raw count
    chart_title = f"Top {first} to {last} characters in the pile"
    fig = px.bar(
        window,
        x="perc_pile",
        y="unicode_char",
        text="count",
        orientation="h",
        title=chart_title,
    )

    # 18 per row gives each character label room to display; dark theme
    fig.update_layout(height=N*18, template="plotly_dark")
    # x axis: ticks on top, logarithmic scale, scientific tick labels for the
    # later (smaller-percentage) windows and fixed-point for the first one,
    # plus a title noting the unit
    fig.update_xaxes(
        side="top",
        type="log",
        tickformat=".2f" if Nskip <= 0 else ".2e",
        title_text="perc_pile (%)",
    )
    # abbreviated count drawn outside the end of each bar
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

    fig.show()

In [94]:
# Page through the ranking 48 rows per chart: offsets 0, 48, ..., 240 (six charts).
# Raise the stop value (up to 528) to also chart the later windows, offsets 288-480.
for skip in range(0, 288, 48):
    display_bar_chart(df, N=48, Nskip=skip)