# EIS Collection Analysis

Load collected Parquet tables from `data/processed/` and summarize the
attachments and attribute coverage.

In [1]:
from pathlib import Path
import re
import pandas as pd


def resolve_processed_base() -> Path:
    """Locate the processed-data directory starting from the current working directory.

    Candidates are checked in this order: ``<cwd>/data/processed``,
    ``<cwd>/processed``, then ``<ancestor>/data/processed`` for every ancestor
    of the working directory, nearest first.

    Returns:
        The first candidate that is an existing directory.

    Raises:
        FileNotFoundError: If none of the candidates is a directory.
    """
    cwd = Path.cwd().resolve()
    candidates = [cwd / "data" / "processed", cwd / "processed"]
    candidates.extend(parent / "data" / "processed" for parent in cwd.parents)
    for candidate in candidates:
        # is_dir() instead of exists(): a regular file that happens to be named
        # "processed" must not be returned as the data folder.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError("Could not locate data/processed directory from cwd")


def extract_run_id(path: Path) -> int | None:
    match = re.search(r"run_(\d+)_", path.name)
    if not match:
        return None
    return int(match.group(1))


def latest_run_id(path: Path) -> int | None:
    """Return the largest run ID found among the ``*.parquet`` files directly in ``path``.

    Files whose names carry no run ID are skipped. Returns None when no file
    yields a run ID.
    """
    known_ids = []
    for parquet_file in path.glob("*.parquet"):
        rid = extract_run_id(parquet_file)
        if rid is not None:
            known_ids.append(rid)
    if not known_ids:
        return None
    return max(known_ids)


def read_run_parquets(path: Path, run_id: int | None) -> pd.DataFrame:
    if run_id is None:
        files = list(path.glob("*.parquet"))
    else:
        files = list(path.glob(f"*run_{run_id}_*.parquet"))
    if not files:
        return pd.DataFrame()
    frames = [pd.read_parquet(file) for file in sorted(files)]
    return pd.concat(frames, ignore_index=True)


# Find the processed-data folder; the newest run is determined from the
# guarantees folder and the same run_id is then used for all three tables.
base = resolve_processed_base()
print("Using processed base:", base)

run_id = latest_run_id(base / "guarantees")
print("Latest run_id:", run_id)

guarantees, attributes, files = (
    read_run_parquets(base / folder_name, run_id)
    for folder_name in ("guarantees", "attributes", "files")
)

# Row counts first, then a preview of each table.
print("guarantees rows", len(guarantees))
print("attributes rows", len(attributes))
print("files rows", len(files))

display(guarantees.head(), attributes.head(), files.head())

Using processed base: /Users/home/Work/10-edu/data-science/thesis/code/masters-thesis-dev/data/processed
Latest run_id: 53
guarantees rows 9893
attributes rows 497427
files rows 10378


Unnamed: 0,run_id,id,status,general_url,documents_url,fetched_at,warnings,error
0,53,1691322,OK,https://zakupki.gov.ru/epz/bankguarantee/guara...,https://zakupki.gov.ru/epz/bankguarantee/guara...,2026-02-13T07:34:35.285531+00:00,[],
1,53,1699189,OK,https://zakupki.gov.ru/epz/bankguarantee/guara...,https://zakupki.gov.ru/epz/bankguarantee/guara...,2026-02-13T07:34:45.146331+00:00,[],
2,53,1699068,OK,https://zakupki.gov.ru/epz/bankguarantee/guara...,https://zakupki.gov.ru/epz/bankguarantee/guara...,2026-02-13T07:34:53.677027+00:00,[],
3,53,1698670,OK,https://zakupki.gov.ru/epz/bankguarantee/guara...,https://zakupki.gov.ru/epz/bankguarantee/guara...,2026-02-13T07:35:03.546578+00:00,[],
4,53,1699847,OK,https://zakupki.gov.ru/epz/bankguarantee/guara...,https://zakupki.gov.ru/epz/bankguarantee/guara...,2026-02-13T07:35:13.711238+00:00,[],


Unnamed: 0,run_id,id,section,field_name,field_value,document_index,document_number
0,53,1691322,Сводная информация (верхний блок),Статус,Размещено,,
1,53,1691322,Сводная информация (верхний блок),Банк-гарант,"""АКЦИОНЕРНЫЙ КОММЕРЧЕСКИЙ БАНК ""ДЕРЖАВА"" ПУБЛИ...",,
2,53,1691322,Сводная информация (верхний блок),ИНН,7729003482,,
3,53,1691322,Сводная информация (верхний блок),КПП,770401001,,
4,53,1691322,Сводная информация (верхний блок),Номер извещения об осуществлении закупки,0172200003118000032,,


Unnamed: 0,run_id,id,file_index,stored_filename,stored_path,original_filename,download_url,document_index,document_number,page_count,mime_type,download_status,sha256
0,53,1691322,1,1691322_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,186165.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,0B62783200007618000401,2,application/pdf,DOWNLOADED,ac42cdb358e368da3dd590e677bde7b9d7768bcb7eec27...
1,53,1699189,1,1699189_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,БГ_5120.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,01T2380802757718000101,2,application/pdf,DOWNLOADED,d2f1e333ab0463eda901fcbc064dd2502b11df95282462...
2,53,1699068,1,1699068_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,лига групп 15732-21.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,0J11910300307018001001,2,application/pdf,DOWNLOADED,f83dfc9cd218d6001c99ef4cc657a4fda1ece3be32f5a0...
3,53,1698670,1,1698670_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,41486.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,04R3471002615218000101,2,application/pdf,DOWNLOADED,824928a33dc4471d8f293fb6f58b5849e7d79f5f31546e...
4,53,1699847,1,1699847_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,БГ 4875 от 20.02.2018.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,0381370267767018000501,2,application/pdf,DOWNLOADED,524f077e7c96e37ffb61bdbe3478f7a1be62586455c3fd...


In [18]:
# Summary of the `files` table loaded for the latest run: download statuses,
# file extensions, on-disk sizes and duplicate content hashes.
if files.empty:
    print("No files table found.")
else:
    files = files.copy()
    files["extension"] = files["stored_filename"].str.extract(r"(\.[^\.]+)$", expand=False).fillna("")

    def file_size(path: str) -> int:
        """Return the on-disk size of ``path`` in bytes, or 0 if it cannot be read.

        Missing values (None/NaN) and empty strings count as 0: rows for failed
        downloads carry no stored path, and ``Path(None)`` would raise TypeError.
        """
        if not isinstance(path, str) or not path:
            return 0
        try:
            return Path(path).stat().st_size
        except OSError:
            # Covers FileNotFoundError as well (it is a subclass of OSError).
            return 0

    files["size_bytes"] = files["stored_path"].apply(file_size)

    print("Download status counts")
    display(files["download_status"].value_counts())

    print("File extensions")
    display(files["extension"].value_counts())

    total_size = int(files["size_bytes"].sum())
    downloaded_size = int(files.loc[files["download_status"].eq("DOWNLOADED"), "size_bytes"].sum())

    print("Total size (bytes)", total_size)
    print("Downloaded size (bytes)", downloaded_size)
    print("Existing files", int((files["size_bytes"] > 0).sum()))

    if "sha256" in files.columns:
        # Exclude missing hashes before looking for duplicates. A plain
        # astype(str) would turn None/NaN into "None"/"nan", and every file
        # without a hash would then be reported as a duplicate of the others.
        has_hash = files["sha256"].notna() & files["sha256"].astype(str).str.len().gt(0)
        nonempty = files[has_hash]
        dupes = nonempty[nonempty.duplicated("sha256", keep=False)]
        if dupes.empty:
            print("No duplicate hashes detected.")
        else:
            print("Duplicate hashes detected")
            display(dupes.sort_values("sha256"))

Download status counts


download_status
DOWNLOADED    20
Name: count, dtype: int64

File extensions


extension
.pdf    20
Name: count, dtype: int64

Total size (bytes) 35095657
Downloaded size (bytes) 35095657
Existing files 20
Duplicate hashes detected


Unnamed: 0,run_id,id,file_index,stored_filename,stored_path,original_filename,download_url,document_index,document_number,page_count,mime_type,download_status,sha256,extension,size_bytes
7,38,1940011,1,1940011_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,БГ 65865 ВБЦ-18 от 21.06.2018.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,0383711300305118000603,2,application/pdf,DOWNLOADED,ae77565070884034178385680ed4ae896e1e1ad54d099b...,.pdf,2344371
8,38,1940011,2,1940011_2.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,БГ 65865 ВБЦ-18 от 21.06.2018.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,2,0383711300305118000601,2,application/pdf,DOWNLOADED,ae77565070884034178385680ed4ae896e1e1ad54d099b...,.pdf,2344371
10,38,1962721,1,1962721_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,Бланк_гар_ПСФ Корт_2216_290618.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,1,02L3781703211518002303,2,application/pdf,DOWNLOADED,bd7f1e44901be948bd28433c919095ea5613d82d26a708...,.pdf,1994414
11,38,1962721,2,1962721_2.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,Бланк_гар_ПСФ Корт_2216_290618.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,2,02L3781703211518002301,2,application/pdf,DOWNLOADED,bd7f1e44901be948bd28433c919095ea5613d82d26a708...,.pdf,1994414


In [3]:
# attributes['field_name'].value_counts()

In [5]:
# guarantees.head()

In [2]:
from pathlib import Path
import hashlib


def read_all_parquets(path: Path) -> pd.DataFrame:
    """Load every ``*.parquet`` file directly inside ``path`` into one DataFrame.

    Files are read in sorted path order and concatenated row-wise with a fresh
    index. An empty DataFrame is returned when the directory holds no Parquet
    files.
    """
    parquet_paths = sorted(path.glob("*.parquet"))
    if len(parquet_paths) == 0:
        return pd.DataFrame()
    tables = []
    for parquet_path in parquet_paths:
        tables.append(pd.read_parquet(parquet_path))
    return pd.concat(tables, ignore_index=True)


# Load every run of each table, not just the most recent one.
guarantees_all = read_all_parquets(base / "guarantees")
attributes_all = read_all_parquets(base / "attributes")
files_all = read_all_parquets(base / "files")

if guarantees_all.empty:
    raise RuntimeError("No guarantees tables found in processed folder.")

if "run_id" not in guarantees_all.columns:
    raise RuntimeError("guarantees table missing run_id; rerun parser with run_id support.")

# Pick the latest run per guarantee ID.
# na_position="first" matters here: by default sort_values puts NaN last, so a
# row with no run_id would win over every numbered run under keep="last".
# NOTE(review): this assumes rows lacking run_id (or fetched_at) are older
# records written before run_id support existed - confirm against the parser.
guarantees_latest = (
    guarantees_all
    .sort_values(["id", "run_id", "fetched_at"], na_position="first")
    .drop_duplicates("id", keep="last")
)

# Keep only the attribute and file rows that belong to each guarantee's chosen run.
latest_keys = guarantees_latest[["id", "run_id"]].drop_duplicates()
attributes_latest = attributes_all.merge(latest_keys, on=["id", "run_id"], how="inner")
files_latest = files_all.merge(latest_keys, on=["id", "run_id"], how="inner")


def file_exists(path_value: str) -> bool:
    """Report whether ``path_value`` names something that exists on disk.

    Anything that is not a non-empty string (None, NaN, "", numbers) yields
    False without touching the filesystem.
    """
    if isinstance(path_value, str) and path_value:
        return Path(path_value).exists()
    return False


# Mark every latest-run attachment by whether its stored file is on disk, and
# keep an independent copy of the rows that are.
on_disk = files_latest["stored_path"].apply(file_exists)
files_latest["file_exists"] = on_disk
files_latest_existing = files_latest[files_latest["file_exists"]].copy()

# Row counts of the latest-run tables.
print("Latest guarantees rows", len(guarantees_latest))
print("Latest attributes rows", len(attributes_latest))
print("Latest files rows", len(files_latest))
print("Existing files rows", len(files_latest_existing))

Latest guarantees rows 358238
Latest attributes rows 17708395
Latest files rows 373051
Existing files rows 371963


In [3]:
# Latest-run file rows whose stored file does not exist on disk.
files_latest.loc[files_latest["file_exists"].eq(False)]

Unnamed: 0,id,file_index,stored_filename,stored_path,original_filename,download_url,mime_type,download_status,sha256,document_index,document_number,page_count,run_id,file_exists
48,461122,1,461122_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,11122015193909_002.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_TIMEOUT,,1.0,01Q2772339668515000501,0.0,,False
69,119178,1,,,1941.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,0420173200004614004101,0.0,14.0,False
132,40710,2,,,Договор БГ 385-53.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,05A0141200001514000603,0.0,17.0,False
861,15,1,,,"БГ 018 (01814, 15 млн).jpg",https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,0930373100042414000101,0.0,28.0,False
880,66,2,,,гарантия 14103.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,2.0,0890373200001614000101,0.0,28.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33333,9717,3,,,doc08297020140326154708.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,0420127200001114000101,0.0,35.0,False
33334,9835,1,,,ООО Мистстудия.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,02H0148300022214000301,0.0,35.0,False
33337,9902,3,,,ДС 2.tif,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,07X0315300000114001401,0.0,35.0,False
33349,9821,3,,,Заявка уралтранс 1.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,,FAILED_NOT_FOUND,,1.0,00P0372200180814000205,0.0,35.0,False


In [4]:
# Sample lookup: the on-disk attachments recorded for guarantee 40710.
files_latest_existing.loc[files_latest_existing["id"].eq(40710)]

Unnamed: 0,id,file_index,stored_filename,stored_path,original_filename,download_url,mime_type,download_status,sha256,document_index,document_number,page_count,run_id,file_exists
131,40710,1,40710_1.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,Договор БГ 385-53.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,application/pdf,DOWNLOADED,e829cda7956f706ea1424c8cd784bf0ea94906553a1708...,1.0,05A0141200001514000603,3.0,17.0,True
133,40710,3,40710_3.pdf,/Users/home/Work/10-edu/data-science/thesis/co...,Договор БГ 385-53.pdf,https://zakupki.gov.ru/44fz/filestore/public/1...,application/pdf,DOWNLOADED,e829cda7956f706ea1424c8cd784bf0ea94906553a1708...,2.0,05A0141200001514000601,3.0,17.0,True


In [5]:
# Export the latest-run tables as CSV into <processed>/final, creating the
# folder if needed. files_latest.csv holds only the rows whose file exists on
# disk (files_latest_existing).
final_dir = base / "final"
final_dir.mkdir(parents=True, exist_ok=True)

for csv_name, table in (
    ("guarantees_latest.csv", guarantees_latest),
    ("attributes_latest.csv", attributes_latest),
    ("files_latest.csv", files_latest_existing),
):
    table.to_csv(final_dir / csv_name, index=False)

print("Saved CSVs to", final_dir)

Saved CSVs to /Users/home/Work/10-edu/data-science/thesis/code/masters-thesis-dev/data/processed/final


In [6]:
def compute_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so large attachments are never held in
    memory in full.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(1024 * 1024):
            digest.update(block)
    return digest.hexdigest()


# Count distinct attachments by content hash, computing the hash from disk for
# rows where the collector did not record one.
total_files = len(files_latest_existing)
sha_series = files_latest_existing.get("sha256")
if sha_series is None:
    # The fallback must share the frame's index: files_latest_existing is a
    # filtered subset, so a default RangeIndex would not align with the boolean
    # mask or the .loc assignment below.
    sha_series = pd.Series("", index=files_latest_existing.index, dtype="object")

# fillna returns a new Series, so the assignments below never write into the frame.
sha_series = sha_series.fillna("")
missing_sha = files_latest_existing[sha_series.eq("")]

if not missing_sha.empty:
    computed = []
    for path_value in missing_sha["stored_path"]:
        try:
            computed.append(compute_sha256(Path(path_value)))
        except (OSError, TypeError, ValueError):
            # Best effort: an unreadable file or unusable path keeps an empty
            # hash and is left out of the unique count.
            computed.append("")
    sha_series.loc[missing_sha.index] = computed

unique_files = sha_series[sha_series.ne("")].nunique()

print("Total downloaded files (existing):", total_files)
print("Unique files by sha256:", unique_files)

Total downloaded files (existing): 279685
Unique files by sha256: 253594


In [7]:
# files_latest_existing[files_latest_existing['stored_filename'].str.endswith(".jpg")]

In [8]:
# files_latest_existing[files_latest_existing['stored_filename'].str.endswith(".doc")].head()

### Draw random samples

In [25]:
import numpy as np
# Draw 10 IDs (with replacement) from 1..1962720 using a fixed seed so the
# sample is reproducible. np.arange replaces the 1.9M-element Python list, and
# the values are joined explicitly: print(..., sep=',') has no effect when
# print receives a single argument.
np.random.seed(13)
sample_ids = np.random.choice(np.arange(1, 1962721), 10)
print(",".join(str(value) for value in sample_ids))

[1540435 1751217 1015883  253457  256743 1737571 1791733 1900698 1860333
 1216443]


In [26]:
import random

# Generate 1000 random integers between 1 and 1962721 (both inclusive, repeats
# possible). A dedicated seeded generator makes the list reproducible on re-run
# without touching the global `random` state.
id_rng = random.Random(13)
random_numbers = [str(id_rng.randint(1, 1962721)) for _ in range(1000)]

# Print as a comma-separated string
print(",".join(random_numbers))

632799,1855251,427914,134076,771734,1252185,927954,182960,647662,791601,1527702,165346,330910,1406207,920364,22646,517826,760564,1604391,680228,408552,1791999,1006478,280453,1705872,1485846,1343216,1757698,1221434,23371,471740,848061,1918047,1712632,1907802,89205,1478783,131467,1733332,1835254,1835257,574568,902023,799717,962972,785436,1896702,1504393,837180,1514219,14599,1048194,962218,1884024,111279,1206503,324864,1948850,141638,1227861,1711964,1810707,1252730,469093,125763,674026,1829334,590980,1096111,814434,1863851,94598,694723,982556,666999,1244657,23927,359014,296830,1555730,1394269,1349052,472611,816536,1323696,134057,403195,1033882,1496067,533237,610329,710262,287553,1097956,3867,1158704,673804,1543957,1844381,1491166,650777,1899724,1571107,1698717,498192,200543,282113,688238,552192,1512231,311273,558256,1545165,36860,1716103,908737,1827992,1032447,824100,889860,272652,1305664,1498141,588969,625657,171545,823671,1173043,195180,1234197,602460,1556709,943796,915101,837800,415493