# Get Statistics on Existing Data

Before we proceed with conversion of the data to a format that can be used by the NoSketch Engine let's first get some statistics on the existing data. This will help us understand the data better and also help us in making decisions on how to proceed with the data conversion.

In [5]:
from pathlib import Path
import zipfile
# TODO Write a function that takes a zipfile and provides statistics on its contents
# The function should return a dictionary with the following keys
# - number_of_first_level_folders (this would indicate how many individual issues are in the zip) 
# - number_of_files_across_all_folders
# - total_size
# - largest_file - value would be tuple with file name and size
# - smallest_file - value would be tuple with file name and size
# - files_with_no_extension
# - common_extensions - value would be a list of tuples with extension and count
# TODO could add more keys if you think they would be useful

# it would be nice to also have the following optional keys, provided as arguments to the function
# suffix_with_extension_count
# for example _ocr_xml_count - would be a count of files ending with _ocr.xml - 
# note how key is constructed from suffix but uses _ instead of . where applicable

def get_stats(src_zip: str|Path, suffixes=("_ocr.xml","_alto.xml")) -> dict:
    stats_dict = {}
    # TODO
    return stats_dict


In [6]:
# TODO
# when get_stats function is written
# next step would be to write a function that takes a folder and provides statistics on all zip files in the folder and its subfolders
# the function should return a dictionary with the following keys
# - number_of_zip_files
# - total_size
# - largest_zip_file - value would be tuple with file name and size
# - smallest_zip_file - value would be tuple with file name and size
# - zip_file_stats - value would be a dictionary where keys are zip file names and values are the output of get_stats function (another dictionary)
# thus zip_file_stats key value would be a dictionary of dictionaries

def get_stats_on_zips(src_folder: str|Path) -> dict:
    zip_dict = {}
    # TODO
    return zip_dict

## Statistics on Articles subfolder

Our institutution has a large number of plaintext articles already available and stored in our local system. These articles are stored in the `articles` subfolder. Let's first get some statistics on these articles.


In [17]:
import zipfile
from pathlib import Path
import re
import statistics
import logging
from typing import Dict, Union, List

# Set up logging configuration
logging.basicConfig(level=logging.INFO)  # Change to DEBUG for detailed output

def get_article_stats(zip_file: Union[str, Path], debug: bool = False) -> Dict[str, Union[int, float, List[int]]]:
    """Extract and compute statistics from a given zip file containing articles."""
    # Initialize the statistics dictionary
    article_stats_dict = {
        "number_of_articles": 0,
        "number_of_issues": 0,
        "min_year": None,
        "max_year": None,
        "year_issue_count": {},  
        "years_active": [],       
        "min_file_size": None,
        "max_file_size": None,
        "median_file_size": None,
        "average_file_size": None,
    }

    try:
        # Open the zip file
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            file_list = zip_ref.namelist()
            article_stats_dict["number_of_articles"] = len(file_list)

            if debug:
                logging.debug(f"Processing zip file: {zip_file}")
                logging.debug(f"First 5 files in zip: {file_list[:5]}")

            if article_stats_dict["number_of_articles"] == 0:
                return article_stats_dict

            # Initialize variables for statistics
            years_active = set()
            publication_issue_set = set()
            file_size_list = []

            # Compile a more flexible regex pattern for partial matching
            pattern = re.compile(r"(?P<publication>[a-z]+).*?(?P<year>\d{4}).*?n(?P<issue1>\d{1,3})(?:-(?P<issue2>\d{1,3}))?_\d{3}.*\.txt")

            for file in file_list:
                # Collect file sizes
                file_size = round(zip_ref.getinfo(file).file_size / 1024, 3)
                file_size_list.append(file_size)

                # Attempt partial matching with regex
                match = pattern.search(file)
                if match:
                    pub_name = match.group('publication')
                    year = int(match.group('year'))
                    issue_start = int(match.group('issue1'))
                    issue_end = int(match.group('issue2')) if match.group('issue2') else issue_start

                    if debug and len(publication_issue_set) < 5:
                        logging.debug(f"Parsed file: {file} -> Publication: {pub_name}, Year: {year}, Issue Range: {issue_start}-{issue_end}")

                    years_active.add(year)

                    for issue in range(issue_start, issue_end + 1):
                        publication_issue_set.add((pub_name, year, issue))
                        article_stats_dict["year_issue_count"][year] = article_stats_dict["year_issue_count"].get(year, 0) + 1
                elif debug and len(publication_issue_set) < 5:
                    logging.debug(f"No partial match found for file: {file}")

            # Compute file size statistics
            if file_size_list:
                article_stats_dict["min_file_size"] = min(file_size_list)
                article_stats_dict["max_file_size"] = max(file_size_list)
                article_stats_dict["average_file_size"] = round(sum(file_size_list) / len(file_size_list), 3)
                article_stats_dict["median_file_size"] = round(statistics.median(file_size_list), 3)

            # Calculate aggregated statistics
            article_stats_dict["number_of_issues"] = len(publication_issue_set)
            article_stats_dict["min_year"] = min(years_active) if years_active else None
            article_stats_dict["max_year"] = max(years_active) if years_active else None
            article_stats_dict["years_active"] = sorted(years_active)

            if debug:
                logging.debug(f"Years Active (first 5): {list(years_active)[:5]}")
                logging.debug(f"Unique publication issues (first 5): {list(publication_issue_set)[:5]}")
                logging.debug(f"Year Issue Count: {dict(list(article_stats_dict['year_issue_count'].items())[:5])}")

    except zipfile.BadZipFile:
        logging.error(f"The file {zip_file} is not a valid zip file.")
    except Exception as e:
        logging.error(f"An error occurred while processing {zip_file}: {e}")

    return article_stats_dict

def get_stats_on_articles(src_folder: Union[str, Path], debug: bool = False) -> Dict[str, Dict[str, Union[int, float, List[int]]]]:
    """Process all zip files in the specified folder and return their statistics."""
    article_dict = {}
    
    for file in Path(src_folder).glob("*.zip"):
        logging.info(f"\n=== Processing zip: {file.name} ===")
        article_dict[file.name] = get_article_stats(file, debug)
        
    return article_dict

# if __name__ == "__main__":
#    src_folder = r"I:\zips\articles"  # Specify the source folder
#    stats = get_stats_on_articles(src_folder, debug=False)  # Get stats for articles
#    print("\nFinal stats:", stats)  # Output the final statistics


INFO:root:
=== Processing zip: adelaides_latviesu_zinotajs_articles.zip ===
INFO:root:
=== Processing zip: australijas_latvietis_articles.zip ===
INFO:root:
=== Processing zip: avangards_daugavpils_articles.zip ===
INFO:root:
=== Processing zip: avots_articles.zip ===
INFO:root:
=== Processing zip: avots_latvijas_rakstnieku_savienibas_zurnals_articles.zip ===
INFO:root:
=== Processing zip: baltijas_vestnesis_articles.zip ===
INFO:root:
=== Processing zip: baltija_articles.zip ===
INFO:root:
=== Processing zip: baznicas_zinas_articles.zip ===
INFO:root:
=== Processing zip: berniba_articles.zip ===
INFO:root:
=== Processing zip: briva_latvija_apvienota_londonas_avize_un_latvija_articles.zip ===
INFO:root:
=== Processing zip: briva_zeme_articles.zip ===
INFO:root:
=== Processing zip: briviba_articles.zip ===
INFO:root:
=== Processing zip: burtnieks_articles.zip ===
INFO:root:
=== Processing zip: cela_zimes_articles.zip ===
INFO:root:
=== Processing zip: cina_articles.zip ===
INFO:root:
==


Final stats: {'adelaides_latviesu_zinotajs_articles.zip': {'number_of_articles': 726, 'number_of_issues': 80, 'min_year': 1962, 'max_year': 2003, 'year_issue_count': {1962: 14, 1977: 19, 1988: 21, 1995: 43, 1996: 76, 1997: 89, 1998: 60, 1999: 85, 2000: 89, 2001: 108, 2002: 110, 2003: 11}, 'years_active': [1962, 1977, 1988, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003], 'min_file_size': 0.0, 'max_file_size': 7.533, 'median_file_size': 0.875, 'average_file_size': 1.264}, 'australijas_latvietis_articles.zip': {'number_of_articles': 93180, 'number_of_issues': 0, 'min_year': None, 'max_year': None, 'year_issue_count': {}, 'years_active': [], 'min_file_size': 0.0, 'max_file_size': 52.697, 'median_file_size': 2.66, 'average_file_size': 3.68}, 'avangards_daugavpils_articles.zip': {'number_of_articles': 77968, 'number_of_issues': 4640, 'min_year': 1962, 'max_year': 1992, 'year_issue_count': {1962: 1842, 1963: 2823, 1964: 2788, 1965: 2676, 1966: 2746, 1967: 2701, 1968: 2685, 1969: 2617,

In [25]:
# ! pip install pandas

from typing import Dict, Union, List
import pandas as pd  # Importing pandas

def save_stats_to_csv(article_dict: Dict[str, Dict[str, Union[int, float, List[int]]]], output_file: str):
    """Convert article_dict to a DataFrame and save it as a CSV file, with years handled as integers and total active years calculated."""
    # Flatten the dictionary into a list of rows for the DataFrame
    rows = []
    for file_name, stats in article_dict.items():
        # Ensure that years are treated as integers (or None if missing)
        min_year = int(stats["min_year"]) if stats["min_year"] is not None else None
        max_year = int(stats["max_year"]) if stats["max_year"] is not None else None

        # Calculate the total number of years active as the length of the 'years_active' list
        years_active_count = len(stats.get("years_active", []))

        # Create a row for each zip file's statistics
        row = {
            "file_name": file_name,
            "number_of_articles": stats.get("number_of_articles"),
            "number_of_issues": stats.get("number_of_issues"),
            "min_year": min_year,  # Ensure integer
            "max_year": max_year,  # Ensure integer
            "years_active_count": years_active_count,  # Total number of years active
            "min_file_size": stats.get("min_file_size"),
            "max_file_size": stats.get("max_file_size"),
            "year_issue_count": str(stats.get("year_issue_count")),  # Convert dict to string for CSV
        }
        rows.append(row)

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows)

    # Force conversion of 'min_year' and 'max_year' to integers where applicable, but keep NaN where no value exists
    df["min_year"] = pd.to_numeric(df["min_year"], errors='coerce').astype('Int64')  # Nullable integer type
    df["max_year"] = pd.to_numeric(df["max_year"], errors='coerce').astype('Int64')  # Nullable integer type

    # Write the DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    
    # Print the head of the DataFrame to see the first few rows
    print(df.head())  # Default is 5 rows, you can pass any number like df.head(10)

# Example call
if __name__ == "__main__":
    src_folder = r"I:\zips\articles"  # Specify the source folder
    stats = get_stats_on_articles(src_folder, debug=False)  # Get stats for articles
    output_file = r"I:\zips\article_stats.csv"  # Specify the output CSV file path
    save_stats_to_csv(stats, output_file)  # Save the DataFrame to a CSV and print the head

INFO:root:
=== Processing zip: adelaides_latviesu_zinotajs_articles.zip ===
INFO:root:
=== Processing zip: australijas_latvietis_articles.zip ===
INFO:root:
=== Processing zip: avangards_daugavpils_articles.zip ===
INFO:root:
=== Processing zip: avots_articles.zip ===
INFO:root:
=== Processing zip: avots_latvijas_rakstnieku_savienibas_zurnals_articles.zip ===
INFO:root:
=== Processing zip: baltijas_vestnesis_articles.zip ===
INFO:root:
=== Processing zip: baltija_articles.zip ===
INFO:root:
=== Processing zip: baznicas_zinas_articles.zip ===
INFO:root:
=== Processing zip: berniba_articles.zip ===
INFO:root:
=== Processing zip: briva_latvija_apvienota_londonas_avize_un_latvija_articles.zip ===
INFO:root:
=== Processing zip: briva_zeme_articles.zip ===
INFO:root:
=== Processing zip: briviba_articles.zip ===
INFO:root:
=== Processing zip: burtnieks_articles.zip ===
INFO:root:
=== Processing zip: cela_zimes_articles.zip ===
INFO:root:
=== Processing zip: cina_articles.zip ===
INFO:root:
==

                                           file_name  number_of_articles  \
0           adelaides_latviesu_zinotajs_articles.zip                 726   
1                 australijas_latvietis_articles.zip               93180   
2                  avangards_daugavpils_articles.zip               77968   
3                                 avots_articles.zip                7568   
4  avots_latvijas_rakstnieku_savienibas_zurnals_a...                1189   

   number_of_issues  min_year  max_year  years_active_count  min_file_size  \
0                80      1962      2003                  12            0.0   
1                 0      <NA>      <NA>                   0            0.0   
2              4640      1962      1992                  31            0.0   
3               549      1905      1915                  11            0.0   
4                66      1987      1992                   6            0.0   

   max_file_size                                   year_issue_count  
0   

In [28]:
import zipfile
from pathlib import Path
import re
import statistics
import logging
from typing import Dict, Union, List

# Set up logging configuration
logging.basicConfig(level=logging.INFO)  # Change to DEBUG for detailed output

def get_article_stats(zip_file: Union[str, Path], debug: bool = False) -> Dict[str, Union[int, float, List[int]]]:
    """Extract and compute statistics from a given zip file containing articles."""
    # Initialize the statistics dictionary
    article_stats_dict = {
        "number_of_articles": 0,
        "number_of_issues": 0,
        "min_year": None,
        "max_year": None,
        "year_issue_count": {},  
        "years_active": [],       
        "min_file_size": None,
        "max_file_size": None,
        "median_file_size": None,
        "average_file_size": None,
    }

    try:
        # Open the zip file
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            file_list = zip_ref.namelist()
            article_stats_dict["number_of_articles"] = len(file_list)

            if debug:
                logging.debug(f"Processing zip file: {zip_file}")
                logging.debug(f"First 5 files in zip: {file_list[:5]}")

            if article_stats_dict["number_of_articles"] == 0:
                return article_stats_dict

            # Initialize variables for statistics
            years_active = set()
            publication_issue_set = set()
            file_size_list = []

            # Compile a more flexible regex pattern for partial matching
            pattern = re.compile(r"(?P<publication>[a-z]+).*?(?P<year>\d{4}).*?n(?P<issue1>\d{1,3})(?:-(?P<issue2>\d{1,3}))?_\d{3}.*\.txt")

            for file in file_list:
                # Collect file sizes
                file_size = round(zip_ref.getinfo(file).file_size / 1024, 3)
                file_size_list.append(file_size)

                # Attempt partial matching with regex
                match = pattern.search(file)
                if match:
                    pub_name = match.group('publication')
                    year = int(match.group('year'))
                    issue_start = int(match.group('issue1'))
                    issue_end = int(match.group('issue2')) if match.group('issue2') else issue_start

                    if debug and len(publication_issue_set) < 5:
                        logging.debug(f"Parsed file: {file} -> Publication: {pub_name}, Year: {year}, Issue Range: {issue_start}-{issue_end}")

                    years_active.add(year)

                    for issue in range(issue_start, issue_end + 1):
                        publication_issue_set.add((pub_name, year, issue))
                        article_stats_dict["year_issue_count"][year] = article_stats_dict["year_issue_count"].get(year, 0) + 1
                elif debug and len(publication_issue_set) < 5:
                    logging.debug(f"No partial match found for file: {file}")

            # Compute file size statistics
            if file_size_list:
                article_stats_dict["min_file_size"] = min(file_size_list)
                article_stats_dict["max_file_size"] = max(file_size_list)
                article_stats_dict["average_file_size"] = round(sum(file_size_list) / len(file_size_list), 3)
                article_stats_dict["median_file_size"] = round(statistics.median(file_size_list), 3)

            # Calculate aggregated statistics
            article_stats_dict["number_of_issues"] = len(publication_issue_set)
            article_stats_dict["min_year"] = min(years_active) if years_active else None
            article_stats_dict["max_year"] = max(years_active) if years_active else None
            article_stats_dict["years_active"] = sorted(years_active)

            if debug:
                logging.debug(f"Years Active (first 5): {list(years_active)[:5]}")
                logging.debug(f"Unique publication issues (first 5): {list(publication_issue_set)[:5]}")
                logging.debug(f"Year Issue Count: {dict(list(article_stats_dict['year_issue_count'].items())[:5])}")

    except zipfile.BadZipFile:
        logging.error(f"The file {zip_file} is not a valid zip file.")
    except Exception as e:
        logging.error(f"An error occurred while processing {zip_file}: {e}")

    return article_stats_dict


In [29]:
import pandas as pd  # Import pandas

def get_stats_on_articles(src_folder: Union[str, Path], debug: bool = False) -> Dict[str, Dict[str, Union[int, float, List[int]]]]:
    """Process all zip files in the specified folder and return their statistics."""
    article_dict = {}
    
    for file in Path(src_folder).glob("*.zip"):
        logging.info(f"\n=== Processing zip: {file.name} ===")
        article_dict[file.name] = get_article_stats(file, debug)
        
    return article_dict

def save_stats_to_csv(article_dict: Dict[str, Dict[str, Union[int, float, List[int]]]], output_file: str):
    """Convert article_dict to a DataFrame and save it as a CSV file."""
    # Flatten the dictionary into a list of rows for the DataFrame
    rows = []
    for file_name, stats in article_dict.items():
        # Create a row for each zip file's statistics
        row = {
            "file_name": file_name,
            "number_of_articles": stats["number_of_articles"],
            "number_of_issues": stats["number_of_issues"],
            "min_year": int(stats["min_year"]) if stats["min_year"] is not None else None,
            "max_year": int(stats["max_year"]) if stats["max_year"] is not None else None,
            "years_active_count": len(stats["years_active"]),  # Count of active years
            "min_file_size": stats["min_file_size"],
            "max_file_size": stats["max_file_size"],
            "average_file_size": stats["average_file_size"],
            "median_file_size": stats["median_file_size"],
            "year_issue_count": str(stats["year_issue_count"]),  # Convert dict to string for CSV
        }
        rows.append(row)

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows)

    # Write the DataFrame to a CSV file
    df.to_csv(output_file, index=False)

    # Print the head of the DataFrame for quick verification
    print(df.head())


if __name__ == "__main__":
    src_folder = r"I:\zips\articles"  # Specify the source folder
    stats = get_stats_on_articles(src_folder, debug=False)  # Get stats for articles
    output_file = r"I:\zips\article_stats.csv"  # Specify the output CSV file path
    save_stats_to_csv(stats, output_file)  # Save the DataFrame to a CSV and print the



INFO:root:
=== Processing zip: adelaides_latviesu_zinotajs_articles.zip ===
INFO:root:
=== Processing zip: australijas_latvietis_articles.zip ===
INFO:root:
=== Processing zip: avangards_daugavpils_articles.zip ===
INFO:root:
=== Processing zip: avots_articles.zip ===
INFO:root:
=== Processing zip: avots_latvijas_rakstnieku_savienibas_zurnals_articles.zip ===
INFO:root:
=== Processing zip: baltijas_vestnesis_articles.zip ===
INFO:root:
=== Processing zip: baltija_articles.zip ===
INFO:root:
=== Processing zip: baznicas_zinas_articles.zip ===
INFO:root:
=== Processing zip: berniba_articles.zip ===
INFO:root:
=== Processing zip: briva_latvija_apvienota_londonas_avize_un_latvija_articles.zip ===
INFO:root:
=== Processing zip: briva_zeme_articles.zip ===
INFO:root:
=== Processing zip: briviba_articles.zip ===
INFO:root:
=== Processing zip: burtnieks_articles.zip ===
INFO:root:
=== Processing zip: cela_zimes_articles.zip ===
INFO:root:
=== Processing zip: cina_articles.zip ===
INFO:root:
==

                                           file_name  number_of_articles  \
0           adelaides_latviesu_zinotajs_articles.zip                 726   
1                 australijas_latvietis_articles.zip               93180   
2                  avangards_daugavpils_articles.zip               77968   
3                                 avots_articles.zip                7568   
4  avots_latvijas_rakstnieku_savienibas_zurnals_a...                1189   

   number_of_issues  min_year  max_year  years_active_count  min_file_size  \
0                80    1962.0    2003.0                  12            0.0   
1                 0       NaN       NaN                   0            0.0   
2              4640    1962.0    1992.0                  31            0.0   
3               549    1905.0    1915.0                  11            0.0   
4                66    1987.0    1992.0                   6            0.0   

   max_file_size  average_file_size  median_file_size  \
0          7.533 