# Gathering Stats about Latvian Periodicals

We will explore zip files that store plaintext files of Latvian periodicals. We will gather some statistics about the periodicals and their content.

## Years published

Our first task will be to extract years from the filenames of the periodicals. We will use regular expressions to extract the years from the filenames.

The file names are stored in zip files. We will use the `zipfile` module to extract the filenames from the zip files.

In [1]:
# first get Python version
import sys
print(f"Python version: {sys.version}")
from pathlib import Path
from datetime import datetime
print(f"Current date and time: {datetime.now()}")
# computer CPU type
import platform
print(f"Computer processor: {platform.processor()}")
# print current working directory
print(f"Current working directory: {Path.cwd()}")
import zipfile
import re
import pandas as pd
print(f"Pandas version: {pd.__version__}")
from tqdm import tqdm
# let's import visualization library Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly import __version__ as plotly_version
print(f"Plotly version: {plotly_version}")


Python version: 3.12.2 (tags/v3.12.2:6abddd9, Feb  6 2024, 21:26:36) [MSC v.1937 64 bit (AMD64)]
Current date and time: 2024-06-14 11:24:58.258054
Computer processor: Intel64 Family 6 Model 60 Stepping 3, GenuineIntel
Current working directory: c:\Users\vsaules\Github\lnb_transports\notebooks
Pandas version: 2.2.1
Plotly version: 5.19.0


In [2]:
# Folder holding the zipped periodicals; the notebook stops here if it is missing.
src_folder = Path("I:/zips")
assert src_folder.exists(), f"Source folder {src_folder} does not exist"
print(f"Source folder: {src_folder}")
# Collect every zip archive directly inside src_folder whose file name contains "articles".
# glob() is not recursive with this pattern, so sub-folders are not searched.
files = list(src_folder.glob("*articles*.zip"))
print(f"Number of files: {len(files)}")


Source folder: I:\zips
Number of files: 118


In [3]:
# now we will write a function that will extract file names from a single zip file
def get_file_names(zip_file: Path, skip=1) -> list:
    """Return the bare file names (folder part removed) stored in a zip archive.

    Args:
        zip_file: Path to the zip archive to inspect.
        skip: Number of leading archive entries to drop before anything else.
            The default of 1 drops the first entry, which in these archives is
            generally the top-level folder.

    Returns:
        The final path component of every remaining *file* entry, in archive
        order. Directory entries are never returned.
    """
    with zipfile.ZipFile(zip_file, "r") as archive:
        entries = archive.namelist()[skip:]
    # Directory entries in a zip listing end with "/". Dropping them here means a
    # nested folder cannot masquerade as a file name (Path("a/b/").name == "b")
    # and later break the year extraction.
    return [Path(entry).name for entry in entries if not entry.endswith("/")]
# Try the function on the first archive found by the glob above.
z_file = files[0]
print(f"File: {z_file}")
# file_names is reused by the following cells as sample input.
file_names = get_file_names(z_file)
print(f"Number of files in zip: {len(file_names)}")

File: I:\zips\adelaides_latviesu_zinotajs_articles.zip
Number of files in zip: 725


In [22]:
# Show the first 3 file names of the sample archive.
print(file_names[:3])

['xalz1962n114_001_plaintext_s01.txt', 'xalz1962n114_001_plaintext_s02.txt', 'xalz1962n114_002_plaintext_s03.txt']


In [4]:
# now let's write a function that will take a single file name and extract following two pieces of information:
# title - which is any text before year
# year - which is first 4 digit number in the file name
# we will use regular expression to extract year from the file name
def get_title_and_year(file_name: str) -> dict:
    """Split a periodical file name into its title prefix and publication year.

    The year is the first run of four consecutive digits in ``file_name``; the
    title is whatever text precedes that run, with surrounding whitespace
    stripped.

    Args:
        file_name: Bare file name, e.g. ``"xalz1962n114_001_plaintext_s01.txt"``.

    Returns:
        ``{"title": <str>, "year": <int>}``, or an empty dict when the name
        contains no four-digit run.
    """
    match = re.search(r"\d{4}", file_name)
    # No four-digit run: report "nothing found" instead of failing on None.group().
    if match is None:
        return {}
    # Everything left of the match is the title; the match is the leftmost
    # four-digit run, so this equals splitting on the year text.
    title = file_name[:match.start()].strip()
    return {"title": title, "year": int(match.group())}
# Try the parser on the first file name of the sample archive.
d = get_title_and_year(file_names[0])
print(f"Title: {d['title']}, Year: {d['year']}")


Title: xalz, Year: 1962


In [5]:
# now let us write a function that given filenames will return a list of dictionaries
# each dictionary will have two keys: title and year
def get_titles_and_years(file_names: list) -> list:
    """Apply get_title_and_year to each file name.

    Returns one result per input name, in the same order as ``file_names``.
    """
    return [get_title_and_year(name) for name in file_names]
# Parse every file name of the sample archive; titles_and_years is reused by later cells.
titles_and_years = get_titles_and_years(file_names)
# number of parsed records (one per file name)
print(f"Number of titles and years: {len(titles_and_years)}")
# first 3 records
print(titles_and_years[:3])
# last 3 records
print(titles_and_years[-3:])

Number of titles and years: 725
[{'title': 'xalz', 'year': 1962}, {'title': 'xalz', 'year': 1962}, {'title': 'xalz', 'year': 1962}]
[{'title': 'xalz', 'year': 2003}, {'title': 'xalz', 'year': 2003}, {'title': 'xalz', 'year': 2003}]


In [6]:
# now let us write a function that will take list of dictionaries of  years and titles
# additional arguments will be start_year with default value of 1920 and end_year with default value of 1940
# this function will convert this list into pandas DataFrame and extract following information:
# min_year, max_year, total count of publications, count of publications between start_year and end_year (inclusive)
def get_summary(titles_and_years: list, start_year=1920, end_year=1940) -> dict:
    """Summarise the year range and record counts of one publication.

    Args:
        titles_and_years: List of dicts carrying a ``"year"`` key (as produced
            by get_titles_and_years).
        start_year: First year of the counted period (inclusive).
        end_year: Last year of the counted period (inclusive).

    Returns:
        A dict with ``min_year``, ``max_year``, ``total_count`` (number of
        records) and ``count_{start_year}_{end_year}`` (records whose year lies
        in the inclusive period). When no record carries a year, for example an
        empty list, ``min_year`` and ``max_year`` are ``None`` and the period
        count is 0 instead of raising a KeyError.
    """
    count_key = f"count_{start_year}_{end_year}"
    df = pd.DataFrame(titles_and_years)
    # An empty list (or only empty dicts) yields a frame without a "year" column.
    if "year" not in df.columns:
        return {"min_year": None, "max_year": None, "total_count": df.shape[0], count_key: 0}
    years = df["year"]
    # Series.between is inclusive on both ends by default.
    in_period = years.between(start_year, end_year)
    return {
        "min_year": years.min(),
        "max_year": years.max(),
        "total_count": df.shape[0],
        count_key: int(in_period.sum()),
    }
# Summarise the sample archive using the default 1920-1940 period.
summary = get_summary(titles_and_years)
print(summary)

{'min_year': 1962, 'max_year': 2003, 'total_count': 725, 'count_1920_1940': 0}


In [None]:
# let's create a function that given a zip file will return summary of all files in that zip
# in addition the dictionary will contain key publication that will contain zip file stem
def get_zip_summary(zip_file: Path,
                    remove="_articles",
                    start_year=1920,
                    end_year=1940) -> dict:
    """Build the get_summary dict for one zip archive and label it.

    Args:
        zip_file: Path of the archive to summarise.
        remove: Text cut out of the archive stem (every occurrence) to form
            the publication name.
        start_year: Passed through to get_summary.
        end_year: Passed through to get_summary.

    Returns:
        The get_summary result extended with ``publication`` (cleaned archive
        stem) and ``zip_file`` (archive path written with forward slashes).
    """
    records = get_titles_and_years(get_file_names(zip_file))
    result = get_summary(records, start_year=start_year, end_year=end_year)
    # publication name = archive stem without the `remove` text
    result["publication"] = zip_file.stem.replace(remove, "")
    # as_posix() gives a plain string with forward slashes on every platform
    result["zip_file"] = zip_file.as_posix()
    return result
# Summarise the sample archive straight from its path.
zip_summary = get_zip_summary(z_file)
print(zip_summary)

{'min_year': 1962, 'max_year': 2003, 'total_count': 725, 'count_1920_1940': 0, 'publication': 'adelaides_latviesu_zinotajs', 'zip_file': 'I:/zips/adelaides_latviesu_zinotajs_articles.zip'}


In [25]:
# let us write a function that will take a list of dictionaries of years and titles and will return value counts of years as a dictionary
# keys will be of form "yr_{year}" and values will be counts of publications in that year
def get_year_counts(titles_and_years: list, key_prefix="yr_") -> dict:
    """Count records per year and return the counts as a flat dict.

    Args:
        titles_and_years: List of dicts carrying a ``"year"`` key.
        key_prefix: Text put in front of each year to form its key
            (``"yr_1962"`` with the default).

    Returns:
        A dict sorted by key that holds one ``{key_prefix}{year}`` entry per
        distinct year, plus ``active_years_count`` (number of distinct years)
        and ``publication_count`` (number of records). When no record carries a
        year, for example an empty list, only the two totals are returned
        instead of raising a KeyError.
    """
    df = pd.DataFrame(titles_and_years)
    # An empty list (or only empty dicts) yields a frame without a "year" column.
    per_year = df["year"].value_counts().to_dict() if "year" in df.columns else {}
    counts = {f"{key_prefix}{year}": n for year, n in per_year.items()}
    # The name "active_years_count" deliberately does not start with the yr_ prefix,
    # so it cannot be mistaken for a year column.
    counts["active_years_count"] = len(per_year)
    counts["publication_count"] = df.shape[0]
    # Sort by key so the totals come first and the years follow in ascending order.
    return dict(sorted(counts.items()))
# Per-year counts for the sample archive.
year_counts = get_year_counts(titles_and_years)
print(year_counts)

{'active_years_count': 12, 'publication_count': 725, 'yr_1962': 14, 'yr_1977': 19, 'yr_1988': 21, 'yr_1995': 43, 'yr_1996': 76, 'yr_1997': 89, 'yr_1998': 60, 'yr_1999': 85, 'yr_2000': 89, 'yr_2001': 108, 'yr_2002': 110, 'yr_2003': 11}


In [26]:
# let's create a function that given a zip file will return year counts in that zip
# in addition the dictionary will contain key publication that will contain zip file stem
# also there will be key zip_file that will contain full path to zip file in posix format
def get_zip_year_counts(zip_file: Path, remove="_articles") -> dict:
    """Return the per-year counts of one zip archive together with its identity.

    Args:
        zip_file: Path of the archive to process.
        remove: Text cut out of the archive stem (every occurrence) to form
            the publication name.

    Returns:
        A dict that starts with ``publication`` (cleaned archive stem) and
        ``zip_file`` (archive path with forward slashes), followed by
        everything get_year_counts returns for the archive's file names.
    """
    records = get_titles_and_years(get_file_names(zip_file))
    return {
        "publication": zip_file.stem.replace(remove, ""),
        "zip_file": zip_file.as_posix(),
        # year counts go last, exactly as dict.update() would add them
        **get_year_counts(records),
    }

# Per-year counts for the sample archive, this time starting from the zip path.
# Note: this rebinds year_counts from the previous cell.
year_counts = get_zip_year_counts(z_file)
print(year_counts)

{'publication': 'adelaides_latviesu_zinotajs', 'zip_file': 'I:/zips/adelaides_latviesu_zinotajs_articles.zip', 'active_years_count': 12, 'publication_count': 725, 'yr_1962': 14, 'yr_1977': 19, 'yr_1988': 21, 'yr_1995': 43, 'yr_1996': 76, 'yr_1997': 89, 'yr_1998': 60, 'yr_1999': 85, 'yr_2000': 89, 'yr_2001': 108, 'yr_2002': 110, 'yr_2003': 11}


In [27]:
# now let us write a function that will take a list of zip files and return a DataFrame with year counts along with publication name and full zip file path
def get_all_year_counts(zip_files: list,
                        remove_postfix="_articles",
                        csv_file = Path("../csv/articles_publications_yearly_summary.csv"),
                        excel_file = Path("../xlsx/articles_publications_yearly_summary.xlsx"),
                        parquet_file = Path("../parquet/articles_publications_yearly_summary.parquet")
                        ) -> pd.DataFrame:
    """Collect per-year counts for many zip archives into one DataFrame.

    Args:
        zip_files: Zip archive paths; each one becomes one row.
        remove_postfix: Forwarded to get_zip_year_counts as ``remove``.
        csv_file: Where to save the table as CSV; a falsy value skips it.
        excel_file: Where to save the table as Excel; a falsy value skips it.
        parquet_file: Where to save the table as Parquet; a falsy value skips it.

    Returns:
        A DataFrame built from the get_zip_year_counts dicts, one row per archive.
    """
    all_year_counts = [get_zip_year_counts(zip_file, remove=remove_postfix) for zip_file in tqdm(zip_files)]
    df = pd.DataFrame(all_year_counts)
    # Each target's parent folder is created first so a fresh checkout without
    # ../csv, ../xlsx or ../parquet does not fail on save (same as get_all_zips_summary).
    # save to csv
    if csv_file:
        Path(csv_file).parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving to csv file: {csv_file}")
        df.to_csv(csv_file, index=False)
    # save to excel
    if excel_file:
        Path(excel_file).parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving to excel file: {excel_file}")
        df.to_excel(excel_file, index=False)
    # save to parquet
    if parquet_file:
        Path(parquet_file).parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving to parquet file: {parquet_file}")
        df.to_parquet(parquet_file, index=False)
    return df

## Getting yearly summary data

In [28]:
# Build the yearly table for every archive found earlier; with the default
# arguments this also writes the csv, xlsx and parquet files.
df = get_all_year_counts(files)
# shape
print(df.shape)
# first 3 rows
df.head(3)

100%|██████████| 118/118 [02:19<00:00,  1.18s/it]


Saving to csv file: ..\csv\articles_publications_yearly_summary.csv
Saving to excel file: ..\xlsx\articles_publications_yearly_summary.xlsx
Saving to parquet file: ..\parquet\articles_publications_yearly_summary.parquet
(118, 198)


Unnamed: 0,publication,zip_file,active_years_count,publication_count,yr_1962,yr_1977,yr_1988,yr_1995,yr_1996,yr_1997,...,yr_1862,yr_1863,yr_1864,yr_1865,yr_1866,yr_1867,yr_2019,yr_2020,yr_2021,yr_2022
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,14.0,19.0,21.0,43.0,76.0,89.0,...,,,,,,,,,,
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,2102.0,1415.0,1193.0,977.0,896.0,981.0,...,,,,,,,,,,
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,1842.0,2794.0,1906.0,,,,...,,,,,,,,,,


In [29]:
# The first 4 columns stay in their original order; all remaining columns
# are put in ascending order of their labels.
columns = [*df.columns[:4], *sorted(df.columns[4:])]
df = df[columns]
# first 3 rows
df.head(3)

Unnamed: 0,publication,zip_file,active_years_count,publication_count,yr_1822,yr_1823,yr_1824,yr_1825,yr_1826,yr_1827,...,yr_2006,yr_2007,yr_2008,yr_2009,yr_2010,yr_2011,yr_2019,yr_2020,yr_2021,yr_2022
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,,,,,,,...,,,,,,,,,,
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,,,,,,,...,1152.0,1259.0,1287.0,1212.0,1251.0,1252.0,,,,
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,,,,,,,...,,,,,,,,,,


In [30]:
# let's get rid of yr_ prefix from columns
# (str.replace removes "yr_" wherever it occurs in a label, not only at the start)
df.columns = [col.replace("yr_", "") for col in df.columns]
# first 3 rows
df.head(3)

Unnamed: 0,publication,zip_file,active_years_count,publication_count,1822,1823,1824,1825,1826,1827,...,2006,2007,2008,2009,2010,2011,2019,2020,2021,2022
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,,,,,,,...,,,,,,,,,,
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,,,,,,,...,1152.0,1259.0,1287.0,1212.0,1251.0,1252.0,,,,
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,,,,,,,...,,,,,,,,,,


In [32]:
# Give the metadata columns their final names in a single in-place rename:
#   publication_count  -> segment_count
#   publication        -> title
#   active_years_count -> years_active
#   zip_file           -> zip_path
df.rename(
    columns={
        "publication_count": "segment_count",
        "publication": "title",
        "active_years_count": "years_active",
        "zip_file": "zip_path",
    },
    inplace=True,
)
# first 3 rows
df.head(3)

Unnamed: 0,title,zip_path,years_active,segment_count,1822,1823,1824,1825,1826,1827,...,2006,2007,2008,2009,2010,2011,2019,2020,2021,2022
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,,,,,,,...,,,,,,,,,,
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,,,,,,,...,1152.0,1259.0,1287.0,1212.0,1251.0,1252.0,,,,
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,,,,,,,...,,,,,,,,,,


In [38]:
# show the first rows of the current table (DataFrame.head() defaults to 5 rows)
df.head()

Unnamed: 0,title,zip_path,years_active,segment_count,1822,1823,1824,1825,1826,1827,...,2007,2008,2009,2010,2011,2019,2020,2021,2022,first_year
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,,,,,,,...,,,,,,,,,,2003
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,,,,,,,...,1259.0,1287.0,1212.0,1251.0,1252.0,,,,,1949
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,,,,,,,...,,,,,,,,,,1992
3,avots,I:/zips/avots_articles.zip,11,7567,,,,,,,...,,,,,,,,,,1915
4,avots_latvijas_rakstnieku_savienibas_zurnals,I:/zips/avots_latvijas_rakstnieku_savienibas_z...,6,1188,,,,,,,...,,,,,,,,,,1992


In [44]:
# Goal of this cell: for each title, find the earliest and the latest year it was active
# and store them in two new columns, first_year and last_year (computed row by row with apply).
# Note: this could have been done earlier, on the smaller per-archive dictionaries that
# only hold years with non-zero counts.
# Start from a clean slate so the cell can be re-run: drop both columns if they exist.
df = df.drop(columns=["first_year", "last_year"], errors="ignore")
# convert year columns (everything after the 4 metadata columns) to numeric
df.iloc[:, 4:] = df.iloc[:, 4:].apply(pd.to_numeric)
# idxmax and idxmin would return the columns holding the max and min values
# we do not need those
# we want the first and last occurrence of non-zero values !!
# let's find first year
# let's create a function that given a row will return column with first non-zero value
def get_first_year(row: pd.Series) -> "str | None":
    """Return the label of the first year column holding a positive count.

    Args:
        row: One row of the yearly table. The first 4 entries are skipped as
            metadata (title, zip_path, years_active, segment_count in this
            notebook); everything after them is treated as a year column.

    Returns:
        The column label of the first entry greater than 0 (the label itself,
        e.g. ``"1962"``, not an int), or ``None`` when no year column holds a
        positive count.
    """
    # get all years
    years = row.iloc[4:]
    # NaN compares as False, so years without data are ignored
    active = years[years > 0]
    if active.empty:
        return None
    return active.index[0]
# Compute first_year for every row and place it directly after the 4th column
# (segment_count), i.e. at position 4. first_year was dropped above, so inserting
# the computed column is the same as appending it and then moving it.
df.insert(4, "first_year", df.apply(get_first_year, axis=1))
# let's find last year
# let's create a function that given a row will return column with last non-zero value
def get_last_year(row: pd.Series) -> "str | None":
    """Return the label of the last year column holding a positive count.

    Args:
        row: One row of the yearly table. The first 5 entries are skipped as
            metadata: the 4 original metadata columns plus first_year, which
            this notebook inserts at position 4 before calling this function.

    Returns:
        The column label of the last entry greater than 0 (the label itself,
        e.g. ``"2003"``, not an int), or ``None`` when no year column holds a
        positive count.
    """
    # get all years
    years = row.iloc[5:]
    # NaN compares as False, so years without data are ignored
    active = years[years > 0]
    if active.empty:
        return None
    return active.index[-1]
# Compute last_year for every row and place it directly after first_year,
# i.e. at position 5. last_year was dropped above, so inserting the computed
# column is the same as appending it and then moving it.
df.insert(5, "last_year", df.apply(get_last_year, axis=1))
df.head(3)

Unnamed: 0,title,zip_path,years_active,segment_count,first_year,last_year,1822,1823,1824,1825,...,2006,2007,2008,2009,2010,2011,2019,2020,2021,2022
0,adelaides_latviesu_zinotajs,I:/zips/adelaides_latviesu_zinotajs_articles.zip,12,725,1962,2003,,,,,...,,,,,,,,,,
1,australijas_latvietis,I:/zips/australijas_latvietis_articles.zip,63,93179,1949,2011,,,,,...,1152.0,1259.0,1287.0,1212.0,1251.0,1252.0,,,,
2,avangards_daugavpils,I:/zips/avangards_daugavpils_articles.zip,31,77967,1962,1992,,,,,...,,,,,,,,,,


In [45]:
# let's save df again to csv, excel and parquet
# These are the same paths as the get_all_year_counts defaults, so the files
# written earlier are overwritten with the renamed / extended table.
df.to_csv("../csv/articles_publications_yearly_summary.csv", index=False)
df.to_excel("../xlsx/articles_publications_yearly_summary.xlsx", index=False)
df.to_parquet("../parquet/articles_publications_yearly_summary.parquet", index=False)

In [31]:
# let's plot total publication count from all publications over years as a line chart
# Only the per-year count columns are summed. They are picked by label (all digits,
# optionally still carrying the "yr_" prefix) rather than by position, so metadata
# columns such as first_year / last_year never end up in the totals.
year_columns = [col for col in df.columns if str(col).removeprefix("yr_").isdigit()]
# total count for each year across all titles (NaN cells are skipped by sum)
total_publications = df[year_columns].sum()
# let's create a line chart
fig = go.Figure()
fig.add_trace(go.Scatter(x=total_publications.index, y=total_publications.values, mode="lines+markers"))
fig.update_layout(title="Total Publications Over Years", xaxis_title="Year", yaxis_title="Publication Count")
fig.show()

In [8]:
# now let us write a function that will take a list of zip files
# it will return a pandas DataFrame with summary of all files in all zip files
def get_all_zips_summary(zip_files: list,
                         start_year=1920,
                         end_year=1940,
                         csv_file=Path("../csv/articles_publications_summary.csv"),
                         excel_file=Path("../xlsx/articles_publications_summary.xlsx"),
                         parquet_file=Path("../parquet/articles_publications_summary.parquet")) -> pd.DataFrame:
    """Summarise many zip archives into one DataFrame and optionally save it.

    Args:
        zip_files: Zip archive paths; each one becomes one row.
        start_year: Forwarded to get_zip_summary.
        end_year: Forwarded to get_zip_summary.
        csv_file: CSV target; a falsy value skips the CSV export.
        excel_file: Excel target; a falsy value skips the Excel export.
        parquet_file: Parquet target; a falsy value skips the Parquet export.

    Returns:
        A DataFrame of the get_zip_summary dicts with ``publication`` as the
        first column.
    """
    rows = [
        get_zip_summary(path, start_year=start_year, end_year=end_year)
        for path in tqdm(zip_files)
    ]
    df = pd.DataFrame(rows)
    # move publication to the front, keep the other columns in their order
    trailing = [name for name in df.columns if name != "publication"]
    df = df[["publication", *trailing]]
    # For every requested export: make sure the target folder exists, announce, write.
    if csv_file:
        csv_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving CSV file to {csv_file}")
        df.to_csv(csv_file, index=False)
    if excel_file:
        excel_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving Excel file to {excel_file}")
        df.to_excel(excel_file, index=False)
    if parquet_file:
        parquet_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"Saving Parquet file to {parquet_file}")
        df.to_parquet(parquet_file, index=False)
    return df
# let's test this function
# df = get_all_zips_summary(files)
# # shape
# print(df.shape)
# # first 5 rows
# df.head()

In [44]:
# let us save summary to a CSV file
# csv_folder = Path("../csv")
# # create folder if it does not exist
# csv_folder.mkdir(exist_ok=True)
# csv_file = csv_folder / "articles_publications_summary.csv"
# # let's  have first column be publication
# df = df[["publication"] + [col for col in df.columns if col != "publication"]]
# df.to_csv(csv_file, index=False)

In [45]:
# # let's also save to excel
# excel_folder = Path("../excel")
# # create folder if it does not exist
# excel_folder.mkdir(exist_ok=True)
# excel_file = excel_folder / "articles_publications_summary.xlsx"
# df.to_excel(excel_file, index=False)

## Loading the data from parquet

In [12]:
# Load the per-publication summary table from parquet. The file name matches the
# default parquet target of get_all_zips_summary.
# Note: this rebinds df, replacing the yearly table used in the cells above.
parquet_folder = Path("../parquet")
assert parquet_folder.exists(), f"Parquet folder {parquet_folder} does not exist"
parquet_file = parquet_folder / "articles_publications_summary.parquet"
assert parquet_file.exists(), f"Parquet file {parquet_file} does not exist"
df = pd.read_parquet(parquet_file)
# shape
print(df.shape)
# first 5 rows
df.head()

(118, 6)


Unnamed: 0,publication,min_year,max_year,total_count,count_1920_1940,zip_file
0,adelaides_latviesu_zinotajs,1962,2003,725,0,I:/zips/adelaides_latviesu_zinotajs_articles.zip
1,australijas_latvietis,1949,2011,93179,0,I:/zips/australijas_latvietis_articles.zip
2,avangards_daugavpils,1962,1992,77967,0,I:/zips/avangards_daugavpils_articles.zip
3,avots,1905,1915,7567,0,I:/zips/avots_articles.zip
4,avots_latvijas_rakstnieku_savienibas_zurnals,1987,1992,1188,0,I:/zips/avots_latvijas_rakstnieku_savienibas_z...


In [13]:
# keep only the publications that have at least one record in the 1920-1940 period
df_filtered = df.query("count_1920_1940 > 0")
# shape
print(df_filtered.shape)
# first 5 rows
df_filtered.head()

(41, 6)


Unnamed: 0,publication,min_year,max_year,total_count,count_1920_1940,zip_file
5,baltijas_vestnesis,1868,1920,78012,4739,I:/zips/baltijas_vestnesis_articles.zip
10,briva_zeme,1919,1940,259067,255273,I:/zips/briva_zeme_articles.zip
12,burtnieks,1927,1945,2772,2723,I:/zips/burtnieks_articles.zip
14,cina,1904,1991,540722,8341,I:/zips/cina_articles.zip
15,dadzis,1912,2008,40234,219,I:/zips/dadzis_articles.zip


In [14]:
# let's sort by count_1920_1940 (largest first) and plot a bar chart
df_filtered = df_filtered.sort_values("count_1920_1940", ascending=False)
fig = px.bar(df_filtered, x="publication", y="count_1920_1940", title="Publications 1920-1940")
# let's use log scale on y axis
fig.update_yaxes(type="log")
# The commented-out lines below are an optional export of the figure to HTML and PNG.
# html folder
# html_folder = Path("../html")
# # create folder if it does not exist
# html_folder.mkdir(exist_ok=True)
# # download plot as HTML file
# fig.write_html(html_folder / "articles_publications_summary.html")
# # save png file
# png_folder = Path("../png")
# # create folder if it does not exist
# png_folder.mkdir(exist_ok=True)
# png_file = png_folder / "articles_publications_summary.png"
# fig.write_image(png_file)
fig.show()

In [15]:
# column names, non-null counts and dtypes of the summary table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   publication      118 non-null    object
 1   min_year         118 non-null    int64 
 2   max_year         118 non-null    int64 
 3   total_count      118 non-null    int64 
 4   count_1920_1940  118 non-null    int64 
 5   zip_file         118 non-null    object
dtypes: int64(4), object(2)
memory usage: 5.7+ KB


In [60]:
# TODO FIXME bars currently are too small to see anything
# let's create a visualization of total counts for each publication from min_year to max_year
# y axis will be publication
# x axis will be min_year to max_year
# size of bar will be total_count
# fig = go.Figure()
# for i, row in df.iterrows():
#     fig.add_trace(go.Bar(x=[row["min_year"], row["max_year"]],
#                             y=[row["publication"], row["publication"]],
#                             orientation="h",
#                             marker=dict(color="blue"),
#                             text=row["total_count"],
#                             textposition="inside",
#                             name=row["publication"]))
# # let's add title and labels
# fig.update_layout(title="Publications by year",
#                     xaxis_title="Year",
#                     yaxis_title="Publication",
#                     # barmode="stack"
#                     )
# # download plot as HTML file
# # fig.write_html(html_folder / "articles_publications_by_year.html")
# # # save png file
# # png_file = png_folder / "articles_publications_by_year.png"
# # fig.write_image(png_file)
# fig.show()