Installs everything that is needed

In [1]:

# Use %pip instead of !pip: the magic installs into the environment of the running
# kernel, whereas a shell `pip` may belong to a different Python installation.
%pip install numpy==1.26.4 pandas==2.2.2
# TODO: pin spacy and matplotlib as well once the versions used for the analysis are known.
%pip install spacy
# Run the model download with the kernel's own interpreter, not whatever `python` is first on PATH.
import sys
!{sys.executable} -m spacy download ro_core_news_lg
%pip install matplotlib

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas==2.2.2)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas==2.2.2)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [pandas]2m3/4[0m [pandas]
[1A[2KSuccessfully installed numpy-1.26.4 pandas-2.2.2 py

Initialise libraries for Analysis

In [4]:
import os
import pandas as pd
import spacy
import numpy as np
from collections import Counter, defaultdict
from pathlib import Path
import json, csv, re, sys

Load SpaCy Ro Model

In [3]:
# Load the Romanian spaCy pipeline `ro_core_news_lg` (downloaded by the install cell at the top of the notebook).
spacy_model = spacy.load('ro_core_news_lg')

## Load files

We load every JSON file found under the `data` folder. The files are stored in nested subfolders, where each deeper level represents a narrower category (for example `int/UK`).

In [3]:
# Root folder of the corpus; the sub-folders below it act as categories.
ROOT = Path("data")

# Walk the whole tree and report, for every readable JSON file, the category folder it sits in.
for fp in ROOT.rglob("*.json"):
    try:
        raw_text = fp.read_text(encoding="utf-8")
        data = json.loads(raw_text)
    except Exception as exc:
        # Unreadable or malformed file: report it and carry on with the next one.
        print(f"[err] {fp}: {exc}")
    else:
        # The category is the file's folder, relative to ROOT and written with "/" separators.
        rel_path = fp.relative_to(ROOT)
        filename = fp.name
        category_path = rel_path.parent.as_posix()

        print(f"[*] {filename} -> {category_path}")


[*] Anglia_Mea_0482.json -> int/UK
[*] Anglia_Mea_0398.json -> int/UK
[*] Anglia_Mea_0330.json -> int/UK
[*] Anglia_Mea_0238.json -> int/UK
[*] Anglia_Mea_0195.json -> int/UK
[*] Anglia_Mea_0451.json -> int/UK
[*] Anglia_Mea_0417.json -> int/UK
[*] Anglia_Mea_0374.json -> int/UK
[*] Anglia_Mea_0324.json -> int/UK
[*] Anglia_Mea_0060.json -> int/UK
[*] Anglia_Mea_0315.json -> int/UK
[*] Anglia_Mea_0341.json -> int/UK
[*] Anglia_Mea_0333.json -> int/UK
[*] Anglia_Mea_0014.json -> int/UK
[*] Anglia_Mea_0221.json -> int/UK
[*] Anglia_Mea_0062.json -> int/UK
[*] Anglia_Mea_0007.json -> int/UK
[*] Anglia_Mea_0319.json -> int/UK
[*] Anglia_Mea_0289.json -> int/UK
[*] Anglia_Mea_0325.json -> int/UK
[*] Anglia_Mea_0176.json -> int/UK
[*] Anglia_Mea_0465.json -> int/UK
[*] Anglia_Mea_0124.json -> int/UK
[*] Anglia_Mea_0005.json -> int/UK
[*] Anglia_Mea_0050.json -> int/UK
[*] Anglia_Mea_0271.json -> int/UK
[*] Anglia_Mea_0354.json -> int/UK
[*] Anglia_Mea_0058.json -> int/UK
[*] Anglia_Mea_0488.

Script care genereaza un fisier .CSV (date_agregate.csv) cu dimensiunile datelor (numar de fisiere, caractere si cuvinte), agregate pe urmatoarele niveluri:

Per total
Per categorie
Per subcategorie
Per judet
Per gazeta

In [None]:

# Corpus root; same folder as the one scanned by the loading cell above.
ROOT = Path("data")
# Destination of the aggregated per-folder statistics.
OUT_CSV = "date_agregate.csv"

# A "word" is any maximal run of Unicode word characters.
word_re = re.compile(r"\w+", flags=re.UNICODE)


def ancestors_with_gazeta(relative_parent: Path, gazeta: str):
    """Yield every aggregation key a file contributes to, shallowest first.

    The keys are the literal "(root)", then the cumulative "/"-joined prefixes
    of the file's folder path, then one extra leaf that appends the gazeta.

    Example:
        relative_parent = Path("politics/europe"), gazeta = "GAZETA"
        -> "(root)", "politics", "politics/europe", "politics/europe/GAZETA"

    Args:
        relative_parent: Folder that contains the file, relative to the corpus root.
        gazeta: Gazeta name; when empty, no gazeta leaf is produced.

    Yields:
        str: One key per level.
    """
    yield "(root)"

    prefix = []
    for part in relative_parent.parts:
        prefix.append(part)
        yield "/".join(prefix)

    if gazeta:
        # Joining a one-element list returns that element unchanged, so a file
        # placed directly in the root needs no special case.
        yield "/".join(prefix + [gazeta])


def content_metrics(s: str):
    """Measure the size of one piece of text.

    Args:
        s: The text to measure. Any value that is not a ``str`` counts as empty.

    Returns:
        tuple: ``(characters, words)``, both computed after stripping leading and
        trailing whitespace; words are the non-overlapping matches of ``word_re``.
    """
    if not isinstance(s, str):
        return 0, 0
    text = s.strip()
    return len(text), len(word_re.findall(text))

# Running totals for every aggregation key (folder prefix or gazeta leaf).
folder_stats = defaultdict(lambda: {"files": 0, "chars": 0, "words": 0})

processed, skipped = 0, 0

for fp in ROOT.rglob("*.json"):
    try:
        obj = json.loads(fp.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[skip] {fp}: {e}")
        skipped += 1
        continue

    # Valid JSON is not necessarily an object: a top-level list, string or number
    # has no .get() and would abort the whole scan with AttributeError, so it is
    # reported and skipped like any other unusable file.
    if not isinstance(obj, dict):
        print(f"[skip] {fp}: expected a JSON object, got {type(obj).__name__}")
        skipped += 1
        continue

    content = obj.get("content", "")
    chars, words = content_metrics(content)

    # The gazeta is the filename prefix before the first underscore
    # (e.g. "GAZETA" from "GAZETA_123.json").
    fname = fp.name
    gazeta = fname.split("_", 1)[0]

    rel_parent = fp.parent.relative_to(ROOT)

    # Credit the file to every ancestor level plus its gazeta leaf.
    for cat in ancestors_with_gazeta(rel_parent, gazeta):
        totals = folder_stats[cat]
        totals["files"] += 1
        totals["chars"] += chars
        totals["words"] += words

    processed += 1


def _path_level(path):
    """Return the depth of an aggregation key: 0 for "(root)", else its number of "/"-separated parts."""
    return 0 if path == "(root)" else path.count("/") + 1


def sort_key(item):
    """Order (path, stats) pairs by depth first, then alphabetically by path."""
    path = item[0]
    return (_path_level(path), path)


# --- write CSV ---
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["folder_path", "level", "files", "total_content_chars", "total_content_words"])
    for path, stats in sorted(folder_stats.items(), key=sort_key):
        w.writerow([path, _path_level(path), stats["files"], stats["chars"], stats["words"]])

print(f"Processed: {processed} files; Skipped: {skipped}")
print(f"Saved: {OUT_CSV}")


Processed: 35015 files; Skipped: 0
Saved: folder_aggregates.csv


Genereaza un fisier care listeaza fisierele fara continut (empty_content.csv).

Genereaza un fisier (gazeta_duplicates.csv) care listeaza gazetele pentru care mai mult de 50% din fisiere au continut duplicat.

In [None]:
import hashlib

# Corpus root (same folder as in the previous cells).
ROOT = Path("data")
# One row per file whose content has no words.
OUT_EMPTY_CSV = "empty_content.csv"
# One row per gazeta whose duplicate ratio exceeds 50%.
OUT_DUP_CSV   = "gazeta_duplicates.csv"


def content_hash(s: str):
    """Return a fingerprint of the text, used for duplicate detection.

    Args:
        s: The text to fingerprint. Leading and trailing whitespace is ignored.

    Returns:
        str: Hexadecimal SHA-1 digest of the stripped, UTF-8 encoded text,
        or ``""`` when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""
    normalized = s.strip()
    return hashlib.sha1(normalized.encode("utf-8")).hexdigest()

empty_files = []                    # rows [relative_path, gazeta, filename] of files without words
gazeta_hashes = defaultdict(list)   # gazeta -> one content fingerprint per file
gazeta_empty_count = Counter()      # gazeta -> number of files without words

# Every file classified as empty gets this one fingerprint. Without it the
# variants of "empty" hash differently ("" for non-string content, the SHA-1 of
# the stripped text for whitespace- or punctuation-only content), although the
# dataset-quality cell removes exactly one hash to account for all empty files.
EMPTY_HASH = content_hash("")

processed, skipped = 0, 0

for fp in ROOT.rglob("*.json"):
    try:
        obj = json.loads(fp.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[skip] {fp}: {e}")
        skipped += 1
        continue

    # A top-level JSON value that is not an object has no .get(); report and skip
    # it instead of letting AttributeError abort the whole scan.
    if not isinstance(obj, dict):
        print(f"[skip] {fp}: expected a JSON object, got {type(obj).__name__}")
        skipped += 1
        continue

    # The gazeta is the filename prefix before the first underscore.
    fname = fp.name
    gazeta = fname.split("_", 1)[0]
    rel_path = fp.relative_to(ROOT).as_posix()

    content = obj.get("content", "")
    _chars, words = content_metrics(content)

    if words == 0:
        # Empty content: record the file and use the shared empty fingerprint.
        empty_files.append([rel_path, gazeta, fname])
        gazeta_empty_count[gazeta] += 1
        h = EMPTY_HASH
    else:
        h = content_hash(content)

    # Empty files stay in the list so that per-gazeta totals cover every file.
    gazeta_hashes[gazeta].append(h)

    processed += 1

# Write the list of files that have no content, header row first.
with open(OUT_EMPTY_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows([["relative_path", "gazeta", "filename"], *empty_files])

# Keep only the gazete in which more than half of the files repeat an already-seen content hash.
gazeta_dupes = []
for gazeta, hashes in gazeta_hashes.items():
    total = len(hashes)
    unique = len(set(hashes))
    dupes = total - unique
    if total:
        dup_ratio = dupes / total
    else:
        dup_ratio = 0
    if not dup_ratio > 0.5:
        continue
    gazeta_dupes.append([gazeta, total, unique, dupes, f"{dup_ratio:.2%}"])

dup_header = ["gazeta", "total_files", "unique_contents", "duplicate_files", "dup_ratio"]
with open(OUT_DUP_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerows([dup_header, *gazeta_dupes])

print(f"Processed: {processed} files; Skipped: {skipped}")
print(f"Saved: {OUT_EMPTY_CSV}, {OUT_DUP_CSV}")


Processed: 35015 files; Skipped: 0
Saved: empty_content.csv, gazeta_duplicates.csv


In [8]:
# Flatten the per-gazeta hash lists into one corpus-wide list.
all_hashes = [digest for per_gazeta in gazeta_hashes.values() for digest in per_gazeta]
total_files = len(all_hashes)
total_empty = sum(gazeta_empty_count[name] for name in gazeta_hashes)

unique_all = len(set(all_hashes))
# NOTE(review): removing exactly one hash presumes that all empty files share a
# single hash value; verify against the scan loop that fills gazeta_hashes.
unique_nonempty = unique_all
if total_empty > 0:
    unique_nonempty -= 1
# A file is counted as "ok" when its content is non-empty and not seen before.
ok_files = unique_nonempty
ok_ratio = (ok_files / total_files) if total_files else 0.0

quality_header = [
    "total_files",
    "empty_files",
    "unique_all_incl_empty",
    "unique_nonempty",
    "ok_files",
    "ok_ratio",
]
quality_row = [total_files, total_empty, unique_all, unique_nonempty, ok_files, f"{ok_ratio:.2%}"]

with open("dataset_quality.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(quality_header)
    w.writerow(quality_row)