In [5]:
# this code will filter specific data from the zip file with subfolders
# the data will be saved in a new directory created in I:\Viesturs_workfiles\Cina, and new folder will be called "Cina_added-data"
# this main folder will have several subfolders: "cina_1948_1949", "cina_added-missing", "cina-subcorpus"

# first, we fill acquire the certain text files from "I:\zips\articles\cina_articles.zip\cina_articles" folder, saving them to "I:\Viesturs_workfiles\Cina\cina_added-data\cina_1948_1949" folder
# the logic: filenames are constructed by this logic "cina1904n002_001_plaintext_s01.txt", where "1904" is the year, "n002" is the issue number, "001" is the page number, "s01" is the section number
# we need to filter out the files that have year 1948 or 1949, and save them to the folder "cina_1948_1949"
import os
import zipfile
import re
from pathlib import Path

# Define the path to the zip file
zip_file = Path(r"I:\zips\articles\cina_articles.zip")

# Define the path to the folder where we will save the extracted files
save_folder = Path(r"I:\Viesturs_workfiles\Cina\cina_added-data\cina_1948_1949")

# Create the folder if it does not exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

# List to hold filenames that do not match the regex pattern
non_matching_files = []

# Updated regex to handle new filename variations, allowing 1-3 digits in issue number
pattern = re.compile(r"cina(\d{4})n\d{1,3}([rR]?)((-\d{1,3})+)?_\d{3}_plaintext_s\d{2}.txt")

# Open the zip file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    # Get the list of files in the zip file
    files = zip_ref.namelist()
    # Iterate over the files
    for file in files:
        # Check if the file is a text file
        if file.endswith(".txt"):
            # Check if the file name matches the expected pattern
            match = pattern.search(file)
            if match:
                year = match.group(1)
                # Check if the year is 1948 or 1949
                if year in ["1948", "1949"]:
                    # Extract and copy the file to the folder
                    with zip_ref.open(file) as source, open(save_folder / os.path.basename(file), 'wb') as target:
                        target.write(source.read())
            else:
                # Add to the list of non-matching files
                non_matching_files.append(file)

# Print the list of non-matching files
print("Files that do not match the regex pattern:")
for non_matching_file in non_matching_files:
    print(non_matching_file)

# For basic counting and check, we can print the number of files in the folder and look at the first 10 files
files = os.listdir(save_folder)
print(f"\nNumber of files in the folder: {len(files)}")
print("First 10 files:", files[:10])


Files that do not match the regex pattern:

Number of files in the folder: 26635
First 10 files: ['cina1948n100_001_plaintext_s01.txt', 'cina1948n100_001_plaintext_s02.txt', 'cina1948n100_001_plaintext_s03.txt', 'cina1948n100_001_plaintext_s04.txt', 'cina1948n100_001_plaintext_s05.txt', 'cina1948n100_001_plaintext_s06.txt', 'cina1948n100_001_plaintext_s07.txt', 'cina1948n100_002_plaintext_s08.txt', 'cina1948n100_002_plaintext_s09.txt', 'cina1948n100_002_plaintext_s10.txt']


In [9]:
import os
import re
from pathlib import Path

# Define the paths to the relevant folders
plaintext_folder = Path(r"I:\Viesturs_workfiles\Cina\cina_added-data\cina_1948_1949")
filtered_folder = Path(r"I:\Viesturs_workfiles\Cina\Cina_izgutie-2709_filtered")

# List to hold filenames that are already in plaintext
files_already_in_plaintext = []

# Preprocess the plaintext folder to extract year and issue combinations
plaintext_files = os.listdir(plaintext_folder)
year_issue_set = set()

# Extract year and issue from the plaintext filenames
for file in plaintext_files:
    match = re.search(r"cina(\d{4})n(\d{1,3})_", file)
    if match:
        year = match.group(1)
        issue_number = match.group(2)
        year_issue_set.add((year, issue_number))

# Print information about the plaintext files
print(f"Total number of plaintext files: {len(plaintext_files)}")
print(f"Unique year-issue combinations in plaintext: {len(year_issue_set)}")

# Set to hold unique years and issues found in the filtered folder
filtered_years = set()
filtered_issues = set()

# Iterate over the folders in the filtered folder
for folder in os.listdir(filtered_folder):
    folder_path = filtered_folder / folder
    if os.path.isdir(folder_path):
        # Iterate over the files in each subfolder
        for file in os.listdir(folder_path):
            # Extract the year and issue number from any matching filename
            match = re.search(r"cina(\d{4})n(\d{1,3})_", file)
            if match:
                year = match.group(1)
                issue_number = match.group(2)
                filtered_years.add(year)
                filtered_issues.add(issue_number)
                # Check if the year and issue number are already in the plaintext files
                if (year, issue_number) in year_issue_set:
                    files_already_in_plaintext.append(file)

# Print the list of files that are already in plaintext
print("\nFiles that are already in plaintext:")
if files_already_in_plaintext:
    for file in files_already_in_plaintext:
        print(file)
else:
    print("None")

# Print information about the filtered folder
print(f"\nUnique years found in filtered folder: {len(filtered_years)}")
print(f"Unique issue numbers found in filtered folder: {len(filtered_issues)}")
print("Unique years:", sorted(filtered_years))
print("Unique issue numbers:", sorted(filtered_issues))


Total number of plaintext files: 26635
Unique year-issue combinations in plaintext: 616

Files that are already in plaintext:
None

Unique years found in filtered folder: 10
Unique issue numbers found in filtered folder: 70
Unique years: ['1904', '1941', '1944', '1945', '1946', '1948', '1949', '1951', '1962', '1964']
Unique issue numbers: ['001', '003', '005', '013', '017', '044', '049', '063', '066', '067', '073', '075', '078', '079', '080', '083', '091', '093', '098', '101', '112', '117', '125', '126', '127', '128', '136', '158', '165', '168', '170', '174', '196', '199', '202', '203', '209', '213', '218', '225', '229', '230', '242', '249', '257', '259', '262', '265', '266', '271', '272', '273', '276', '277', '280', '281', '283', '286', '288', '289', '290', '291', '294', '295', '299', '300', '304', '305', '49', '63']
