# Visualize Top N Authors

This notebook visualizes, as one bar-chart subplot per term, the top N authors ranked by how often they use each of a fixed set of transport-related terms.

## Loading Libraries

In [1]:
# Environment report: interpreter, run timestamp and library versions.
# All imports are grouped first; the prints below keep their original order.

# standard library
import sys
from pathlib import Path
import pickle
import math
from datetime import datetime

# third-party
import pandas as pd
from tqdm import tqdm
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print('Python version: ', sys.version)
# python execution path
print('Python executable: ', sys.executable)
# timestamp of this run
now = datetime.now()
print('Date and time: ', now.strftime("%d/%m/%Y %H:%M:%S"))

print('Pandas version: ', pd.__version__)
print('Plotly version: ', plotly.__version__)

Python version:  3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
Python executable:  c:\pyenvs\venv312\Scripts\python.exe
Date and time:  29/05/2025 13:15:50
Pandas version:  2.2.1
Plotly version:  5.19.0


## Load Big Data

In [2]:
# src = Path("../../not_repo/latsenrom_2025_05_09.parquet")

# # assert src.exists()
# assert src.is_file(), f"File not found: {src}"
# # loading
# print(f"Loading from {src}")
# df = pd.read_parquet(src)
# # check the dataframe
# # shape
# print(f"df.shape: {df.shape}")
# # head
# df.head()

## Define terms and translations

In [3]:
# Latvian lemma (key) -> English label (value).
# Insertion order matters: later cells derive list order from this dict.
original_terms_translations = {
    "auto": "auto",
    "autobuss": "autobus",
    "automobilis": "automobile",
    "dzelzceļš": "railway",
    "mašīna": "machine",
    "motocikls": "motorcycle",
    "ormanis": "horse-drawn cab",
    "pajūgs": "carriage",
    "taksometrs": "taxi",
    "tramvajs": "tram",
    "velosipēds": "bicycle",
    "vezums": "wagon",
}
# sanity check on the number of terms
print(f"We have {len(original_terms_translations)} original terms translations.")

We have 12 original terms translations.


## Functions for top n authors for each term as histogram

In [4]:
def get_top_n_authors_df(df, term, n=5, verbose=True):
    """Return the ``n`` authors with the most rows for ``term``.

    Rows of ``df`` whose ``lemma`` column equals ``term`` are grouped by the
    ``author`` column and counted; the ``n`` largest counts are kept.

    Args:
        df: DataFrame with at least ``lemma`` and ``author`` columns.
        term: lemma value to filter on.
        n: number of authors to keep.
        verbose: accepted for interface compatibility; currently has no effect.

    Returns:
        A pandas Series (despite the function name) indexed by author, holding
        the row counts in descending order.
    """
    rows_for_term = df[df.lemma == term]
    counts_per_author = rows_for_term.groupby('author').size()
    # nlargest already limits the result to n entries
    return counts_per_author.nlargest(n)
# let's get trace given we already have a authors_df
def get_authors_trace(authors_df, term, y_range_max=None, font_size=18, color="rgb(0, 61, 165)", term_dict=None):
    """Build a bar trace of author counts for one term.

    Args:
        authors_df: object exposing ``.index`` (bar labels) and ``.values``
            (bar heights), e.g. the Series returned by ``get_top_n_authors_df``.
        term: term the counts belong to; used as the trace name unless a
            translation is found in ``term_dict``.
        y_range_max: accepted for interface compatibility; not used here.
        font_size: size of the trace's text font.
        color: marker color of the bars.
        term_dict: optional mapping term -> display name.

    Returns:
        A ``plotly.graph_objects.Bar`` trace.
    """
    # A term missing from term_dict falls back to the raw term instead of
    # raising KeyError halfway through building a multi-panel figure.
    if term_dict and term in term_dict:
        name = term_dict[term]
        print(f"Will use name: {name} instead of {term}")
    else:
        name = term
        print(f"Will use name: {name} as is {term}")

    return go.Bar(
        x=authors_df.index,
        y=authors_df.values,
        name=name,
        # font size
        textfont=dict(size=font_size),
        # marker color
        marker_color=color,
    )
# let's create a function that takes df and term_list and returns a figure with subplots for each term
def term_top_authors_subplots(df,
                              term_list,
                              top_n=5,
                              sort=True,
                              height_per_trace=300,
                              columns=3,
                              get_y_range_max=True,
                              turn_off_legend=True,
                              font_size=18,
                              width=1242,
                              term_dict=None,
                              top_authors_dfs=None,
                              colors=None):
    """Create a grid of bar-chart subplots, one per term, of top authors.

    Args:
        df: source DataFrame; only used (via ``get_top_n_authors_df``) when
            ``top_authors_dfs`` is missing or empty.
        term_list: terms to plot; also used as the subplot titles.
        top_n: number of authors per term when the counts are computed here.
        sort: if True, order the subplots alphabetically by term (the series
            and colors are reordered together with the terms).
        height_per_trace: figure height contributed by each subplot row.
        columns: number of subplot columns.
        get_y_range_max: if True, give all subplots the same y range, from 0
            to the overall maximum rounded up to a multiple of 10.
        turn_off_legend: if True, hide the legend.
        font_size: font size for the figure and the subplot titles.
        width: figure width.
        term_dict: optional mapping term -> trace name, forwarded to
            ``get_authors_trace``. It does not change the subplot titles.
        top_authors_dfs: optional precomputed per-term counts, positionally
            aligned with ``term_list``.
        colors: optional per-term bar colors, positionally aligned with
            ``term_list``.

    Returns:
        The assembled plotly figure.

    Raises:
        ValueError: if ``term_list`` is empty or ``top_authors_dfs`` does not
            have one entry per term.
        AssertionError: if ``colors`` does not have one entry per term.
    """
    # Materialize the inputs: they are iterated several times below and may be
    # views or iterators (e.g. dict.values()).
    term_list = list(term_list)
    if not term_list:
        raise ValueError("term_list must contain at least one term")
    top_authors_dfs = list(top_authors_dfs) if top_authors_dfs is not None else []
    if not top_authors_dfs:
        top_authors_dfs = [get_top_n_authors_df(df, term, top_n) for term in term_list]
    if len(top_authors_dfs) != len(term_list):
        # zip() would silently drop the surplus and mislabel nothing visibly
        raise ValueError(
            f"top_authors_dfs has {len(top_authors_dfs)} entries but term_list has {len(term_list)}"
        )
    if colors is not None:
        colors = list(colors)
        assert len(colors) == len(term_list), "Colors list must match term_list length"

    if sort:
        # Sort by term only; sorting (term, series) tuples would compare the
        # series themselves whenever two terms are equal.
        order = sorted(range(len(term_list)), key=lambda position: term_list[position])
        term_list = [term_list[position] for position in order]
        top_authors_dfs = [top_authors_dfs[position] for position in order]
        if colors is not None:
            colors = [colors[position] for position in order]

    y_max = None
    if get_y_range_max:
        # skip empty series: their max is NaN, which math.ceil rejects
        peaks = [authors.max() for authors in top_authors_dfs if len(authors) > 0]
        if peaks:
            # round up to nearest 10
            y_max = math.ceil(max(peaks) / 10) * 10

    # one bar trace per term; the color is only forwarded when given so that
    # get_authors_trace keeps its own default otherwise
    traces = []
    for position, (term, authors_df) in enumerate(tqdm(zip(term_list, top_authors_dfs), total=len(term_list))):
        color_kwargs = {} if colors is None else {"color": colors[position]}
        traces.append(get_authors_trace(authors_df, term, y_max, term_dict=term_dict, **color_kwargs))

    rows = math.ceil(len(term_list) / columns)
    fig = make_subplots(rows=rows, cols=columns, subplot_titles=term_list)
    # height follows the number of rows, so a partially filled last row still gets its space
    fig.update_layout(height=height_per_trace * rows, width=width)
    if y_max:
        print(f"update yaxis range to [0, {y_max}]")
        fig.update_yaxes(range=[0, y_max])

    # fill the grid row by row
    for position, trace in enumerate(traces):
        fig.add_trace(trace, row=position // columns + 1, col=position % columns + 1)

    fig.update_layout(font=dict(size=font_size, color="black"))
    if turn_off_legend:
        fig.update_layout(showlegend=False)
    # subplot titles are layout annotations; give them the same font size
    fig.update_annotations(font=dict(size=font_size))
    return fig

## Create top n authors for each term


In [5]:
# Number of top authors per term; in the visible cells it is referenced only by
# commented-out code (the top-N computation and the "<term>_top_<N>" file names).
N = 5
# top_authors_df_dict = {term: get_top_n_authors_df(df, term, n=N) for term in tqdm(original_terms_translations.keys())}


In [6]:
# show first term results
# term = list(original_terms_translations.keys())[0]
# print(f"Top {N} authors for term '{term}':")
# top_authors_df_dict[term].head(N)

In [7]:
# let's save all dataframes individually
# output_dir = Path("../parquet/top_authors_dfs")
# # create directory if not exists
# output_dir.mkdir(parents=True, exist_ok=True)
# # let's make csv directory as well
# csv_output_dir = Path("../csv/top_authors_dfs")
# # create directory if not exists
# csv_output_dir.mkdir(parents=True, exist_ok=True)
# # save each dataframe to a parquet file
# for term, authors_df in tqdm(top_authors_df_dict.items()):
#     # save to parquet
#     authors_df.to_frame().to_parquet(output_dir / f"{term}_top_{N}.parquet")
#     # also save to csv
#     authors_df.to_frame().to_csv(csv_output_dir / f"{term}_top_{N}.csv", index=True)
#     # # also save to pickle
#     # with open(output_dir / f"{term}.pkl", "wb") as f:
#     #     pickle.dump(authors_df, f)

## Loading Data for Visualization

In [14]:
output_dir = Path("../parquet/top_authors_dfs")
# how many parquet files do we have?
# sorted() because the order in which glob yields files depends on the filesystem
parquet_files = sorted(output_dir.glob("*.parquet"))
print(f"We have {len(parquet_files)} parquet files in {output_dir}")
# we will store them in a dictionary keyed by term
top_authors_series_dict = {}
for file in tqdm(parquet_files):
    # file names follow "<term>_top_<N>.parquet"; cut at the last "_top_" so a
    # term that itself contains "_" is kept whole
    term = file.stem.rsplit("_top_", 1)[0]
    # the name `df` is kept because cells further down may still reference it
    df = pd.read_parquet(file)
    # squeeze only the column axis: a plain squeeze() would turn a one-row
    # frame (a term with a single author) into a scalar instead of a Series
    top_authors_series_dict[term] = df.squeeze(axis="columns")
# print keys for verification
print(f"Top authors df dictionary keys: {list(top_authors_series_dict.keys())[:10]}... (total {len(top_authors_series_dict)})")
# assert they match original_terms_translations
assert set(top_authors_series_dict.keys()) == set(original_terms_translations.keys()), "Top authors df dictionary keys do not match original terms translations keys"
# re-key in the order of original_terms_translations so that positional use of
# .values() lines up with the translations and does not depend on file order
top_authors_series_dict = {term: top_authors_series_dict[term] for term in original_terms_translations}
print("We are good, top authors df dictionary keys match original terms translations keys.")

We have 12 parquet files in ..\parquet\top_authors_dfs


100%|██████████| 12/12 [00:00<00:00, 221.76it/s]

Top authors df dictionary keys: ['autobuss', 'automobilis', 'auto', 'dzelzceļš', 'mašīna', 'motocikls', 'ormanis', 'pajūgs', 'taksometrs', 'tramvajs']... (total 12)
We are good, top authors df dictionary keys match original terms translations keys.





## Visualizing top n authors within subplots

In [15]:
# let's use values from translation dictionary
# Labels and data are both derived from the same key order. Taking
# top_authors_series_dict.values() positionally would pair each English label
# with whatever series happens to sit at that position in the dict.
terms = list(original_terms_translations)
term_translations = [original_terms_translations[term] for term in terms]
top_authors_series = [top_authors_series_dict[term] for term in terms]
# terms 1 fig
fig = term_top_authors_subplots(
    None,  # no source frame needed: the per-term counts are passed in precomputed
    term_translations,
    font_size=22,
    top_authors_dfs=top_authors_series,
)
height = 1140 # 285 * 4
width = 1480 # 370 * 4
# update height and width
fig.update_layout(height=height, width=width)
SAVE_HTML = False
# save
if SAVE_HTML:
    fig.write_html("../html/terms_1_top_5_authors.html")
fig.show()

12it [00:00, 1844.19it/s]

Will use name: auto as is auto
Will use name: autobus as is autobus
Will use name: automobile as is automobile
Will use name: bicycle as is bicycle
Will use name: carriage as is carriage
Will use name: horse-drawn cab as is horse-drawn cab
Will use name: machine as is machine
Will use name: motorcycle as is motorcycle
Will use name: railway as is railway
Will use name: taxi as is taxi
Will use name: tram as is tram
Will use name: wagon as is wagon
update yaxis range to [0, 440]





## Visualizing with color coding for horse and motor

In [12]:
moto_color = "rgb(0, 61, 165)"  # Pantone 293
horse_color = "rgb(255, 194, 90)"  # complementary color to Pantone 293
# Horse-drawn terms get horse_color, every other term gets moto_color.
# velosipēds would technically be self-propelled, but it uses the moto color
# for consistency with previous visualizations.
term_colors = {
    term: (horse_color if term in ("ormanis", "pajūgs", "vezums") else moto_color)
    for term in (
        "auto",
        "autobuss",
        "automobilis",
        "dzelzceļš",
        "mašīna",
        "motocikls",
        "ormanis",
        "pajūgs",
        "taksometrs",
        "tramvajs",
        "velosipēds",
        "vezums",
    )
}



In [16]:
# now let's create the same visualization but with colors
# Labels, data and colors are all looked up by term, so the three lists cannot
# drift apart because of differing dictionary orders.
colored_terms = list(original_terms_translations)
fig = term_top_authors_subplots(
    None,  # no source frame needed: the per-term counts are passed in precomputed
    [original_terms_translations[term] for term in colored_terms],
    font_size=22,
    top_authors_dfs=[top_authors_series_dict[term] for term in colored_terms],
    colors=[term_colors[term] for term in colored_terms],
)
height = 1140 # 285 * 4
width = 1480 # 370 * 4
# update height and width
fig.update_layout(height=height, width=width)
fig.show()

12it [00:00, 24350.10it/s]

Will use name: auto as is auto
Will use name: autobus as is autobus
Will use name: automobile as is automobile
Will use name: bicycle as is bicycle
Will use name: carriage as is carriage
Will use name: horse-drawn cab as is horse-drawn cab
Will use name: machine as is machine
Will use name: motorcycle as is motorcycle
Will use name: railway as is railway
Will use name: taxi as is taxi
Will use name: tram as is tram
Will use name: wagon as is wagon
update yaxis range to [0, 440]



