In [1]:
def load_data(data_file, schema_file):
    col_names = []
    col_types = []

    with open(schema_file, "r") as f:
        for ix, line in enumerate(f.read().strip("\n").split("\n")):
            el = line.split("\t")
            if len(el) != 2:
                raise Exception(f"Error while parsing chema file '{schema_file}' at line {ix + 1}, expected 2 values, but found {el}")
            
            name, type = el

            if len(name) == 0:
                raise Exception(f"Error while parsing chema file '{schema_file}' at line {ix + 1}, empty name")
            if type not in ["str", "num"]:
                raise Exception(f"Error while parsing chema file '{schema_file}' at line {ix + 1}, unknown type '{type}'. Possible types: 'str', 'num'")

            col_names.append(name)
            col_types.append(type)
    
    data = []

    with open(data_file, "r", encoding="utf-8") as f:
        for ix, line in enumerate(f.read().strip("\n").split("\n")):
            el = line.split("\t")

            if len(el) != len(col_names):
                raise Exception(f"Error while parsing data file '{data_file}' at line {ix + 1}, expected {len(col_names)} values, but found {len(el)}")
            
            entry = {}

            for i in range(len(el)):
                if col_types[i] == "num":
                    try:
                        entry[col_names[i]] = float(el[i].strip())
                    except:
                        entry[col_names[i]] = None
                else:
                    entry[col_names[i]] = el[i].strip()
            
            data.append(entry)

    return data

In [2]:
schema_files = {
    2014: "meta/schema-dgov.txt",
    2015: "meta/schema-dgov.txt",
    2016: "meta/schema-dgov.txt",
    2017: "meta/schema-dgov.txt",
    2018: "meta/schema-dgov.txt",
    2019: "meta/schema-edu-raport.txt",
    2020: "meta/schema-edu-raport.txt",
    2021: "meta/schema-edu-initial.txt",
    2022: "meta/schema-edu-initial.txt",
    2023: "meta/schema-edu-initial.txt",
}

data = {}

for year in range(2014, 2024):
    data[year] = load_data(f"data/bac/{year}.csv", schema_files[year])
    print(f"Loaded {len(data[year])} entries for year {year}")

Loaded 161682 entries for year 2014
Loaded 168939 entries for year 2015
Loaded 137338 entries for year 2016
Loaded 135513 entries for year 2017
Loaded 136864 entries for year 2018
Loaded 136091 entries for year 2019
Loaded 155750 entries for year 2020
Loaded 133664 entries for year 2021
Loaded 126453 entries for year 2022
Loaded 130522 entries for year 2023


In [3]:
siiir = load_data("meta/siiir.csv", "meta/siiir-schema.csv")

print(len(siiir))
print(siiir[0])

18434
{'nr_crt': 1.0, 'cod_siiir': '0261205195', 'nume_unitate': 'ASOCIAȚIA CREȘTINĂ DE CARITATE SAMARITEANUL - GRADINITA SAMARITEANUL AGRIȘU MARE', 'denumire': 'GRADINITA SAMARITEANUL AGRIȘU MARE', 'oras': 'AGRIŞU MARE', 'localitate': 'TÂRNOVA', 'judet': 'ARAD', 'stare_legala': 'Cu personalitate juridică', 'tip_unitate': 'Unitate de învățământ', 'administrare': 'Privată'}


In [16]:
sirues = load_data("meta/sirues.csv", "meta/sirues-schema.csv")

print(len(sirues))
print(sirues[0])

7453
{'nr_crt': 1.0, 'judet': 'Alba', 'nume_unitate': 'LICEUL TEHNOLOGIC AGRICOL "ALEXANDRU BORZA" CIUMBRUD', 'cod_sirues': '844547', 'clasificare_1': 'Unitate de învăţământ', 'clasificare_2': 'Grup şcolar agricol', 'clasificare_3': 'Grup şcolar', 'clasificare_4': 'Unitate de învăţământ', 'clasificare_5': 'Grup şcolar agricol'}


In [5]:

def clear_diacritics(s):
    mappings = [
        ('Á', 'A'),
        ('Â', 'A'),
        ('É', 'E'),
        ('Î', 'I'),
        ('Ó', 'O'),
        ('Ö', 'O'),
        ('Ă', 'A'),
        ('Ő', 'O'),
        ('Ş', 'S'),
        ('Ţ', 'T'),
        ('Ș', 'S'),
        ('Ț', 'T'),
    ]
    for a, b in mappings:
        s = s.replace(a, b)
        s = s.replace(a.lower(), b.lower())
    return s

In [19]:


def get_county_code(county_name):
    counties = [('AB', 'ALBA'), ('AG', 'ARGES'), ('AR', 'ARAD'), ('B', 'BUCURESTI'), ('BC', 'BACAU'), ('BH', 'BIHOR'), ('BN', 'BISTRITA'), ('BR', 'BRAILA'), ('BT', 'BOTOSANI'), ('BV', 'BRASOV'), ('BZ', 'BUZAU'), ('CJ', 'CLUJ'), ('CL', 'CALARASI'), ('CS', 'CARAS'), ('CT', 'CONSTANTA'), ('CV', 'COVASNA'), ('DB', 'DAMBOVITA'), ('DJ', 'DOLJ'), ('GJ', 'GORJ'), ('GL', 'GALATI'), ('GR', 'GIURGIU'), ('HD', 'HUNEDOARA'), ('HR', 'HARGHITA'), ('IF', 'ILFOV'), ('IL', 'IALOMITA'), ('IS', 'IASI'), ('MH', 'MEHEDINTI'), ('MM', 'MARAMURES'), ('MS', 'MURES'), ('NT', 'NEAMT'), ('OT', 'OLT'), ('PH', 'PRAHOVA'), ('SB', 'SIBIU'), ('SJ', 'SALAJ'), ('SM', 'SATU'), ('SV', 'SUCEAVA'), ('TL', 'TULCEA'), ('TM', 'TIMIS'), ('TR', 'TELEORMAN'), ('VL', 'VALCEA'), ('VN', 'VRANCEA'), ('VS', 'VASLUI')]
    name = clear_diacritics(county_name.upper())

    for code, county in counties:
        if county in name:
            return code

    raise Exception(f"Unknown county {name}")

def process_dot_gov(data, siiir, sirues):
    school_name_by_siiir = {
        int(el["cod_siiir"]): el["nume_unitate"] for el in siiir
    }

    county_by_siiir = {
        int(el["cod_siiir"]): get_county_code(el["judet"]) for el in siiir
    }

    school_name_by_sirues = {
        int(el["cod_sirues"]): el["nume_unitate"] for el in sirues if el["cod_sirues"] != ""
    }

    county_by_sirues = {
        int(el["cod_sirues"]): get_county_code(el["judet"]) for el in sirues if el["cod_sirues"] != ""
    }

    ignored = 0

    for el in data:
        if int(el["siiir"]) in school_name_by_siiir:
            el["liceu"] = school_name_by_siiir[int(el["siiir"])]
            el["judet"] = county_by_siiir[int(el["siiir"])]
        elif el["sirues"] != "" and int(el["sirues"]) in school_name_by_sirues:
            el["liceu"] = school_name_by_sirues[int(el["sirues"])]
            el["judet"] = county_by_sirues[int(el["sirues"])]
        else:
            el["liceu"] = None
            el["judet"] = None
            ignored += 1
    
    print(f"Ignored {ignored}/{len(data)} entries")

for year in range(2014, 2019):
    process_dot_gov(data[year], siiir, sirues)

    

import re


def canonicalize(entry) -> str:
    liceu = entry["liceu"]
    judet = entry["judet"]

    if liceu is None:
        return ""

    liceu = liceu.replace("Ăˇ", "Á")
    liceu = liceu.replace("Ă©", "É")
    liceu = liceu.replace("Ĺ‘", "Ő").replace("Ă¶", "Ö").replace("Ăł", "Ó")
    liceu = liceu.replace("â€™", "'").replace("Â€™", "'")
    liceu = liceu.replace("’", "'").replace("‘", "'")
    liceu = liceu.replace("''", '"').replace(",,", '"').replace("„", '"').replace("”", '"').replace("“", '"').replace('""', '"')
    liceu = liceu.replace("'", '"')
    if liceu.count('"') != 0 and liceu.count('"') != 2:
        liceu = liceu.replace('"', "")
    liceu = liceu.replace("_", " ")
    liceu = liceu.replace("Ş", "Ș").replace("Ţ", "Ț").replace("ş", "ș").replace("ţ", "ț")

    liceu = liceu.upper()
    
    # forbidden = [' ', '"', '(', ')', ',', '-', '.']
    # for f in forbidden:
    #     liceu = liceu.replace(f, " ")

    # 'Á', 'Â', 'É', 'Î', 'Ó', 'Ö', 'â', 'Ă', 'ă', 'Ő', 'Ş', 'ş', 'Ţ', 'ţ', 'Ș', 'ș', 'Ț', 'ț'
    # only uppercase
    liceu = clear_diacritics(liceu)
    
    liceu = re.sub(r" +", " ", liceu)
    liceu = liceu.strip()
    

    # if liceu.count('"') != 0:
    #     # with 1672, without 1675
    #     l = liceu.split('"')
    #     x = re.sub(r"[^A-Z]+", " ", l[1]).strip()
    #     x = x.split(" ")
    #     x = '.'.join([el[0] for el in x[:-1]] + [x[-1]])
    #     liceu = '"'.join([l[0], x, l[2]])
    
    # if liceu.count('"') != 0 or liceu.count(",") != 0:
    #     last_pos = max(liceu.rfind('"') + 1, liceu.rfind(","))
    #     liceu = liceu[:last_pos] + " " + judet

    liceu = liceu + ", " + judet

    return liceu

total_licee = set()
years = {}
for year in range(2014, 2024):
    # Test for canonicalization collisions
    # unique = {}
    # for el in data[year]:
    #     if el["liceu"] is None:
    #         continue
    #     liceu = canonicalize(el)
    #     raw_liceu = el["liceu"] + " " + el["judet"]
    #     if liceu in unique:
    #         if raw_liceu not in unique[liceu]:
    #             print(f"Collision: {liceu} -> {unique[liceu]} vs {raw_liceu}")
    #             unique[liceu].append(raw_liceu)
    #     else:
    #         unique[liceu] = [raw_liceu]
    for el in data[year]:
        if el["liceu"] is None:
            continue
        liceu = canonicalize(el)
        if liceu not in years:
            years[liceu] = [year]
        if years[liceu][-1] != year:
            years[liceu].append(year)

    charset = sorted(set(''.join([canonicalize(el) for el in data[year]])))
    print(f"Found {len(charset)} unique characters for year {year}: {charset}")
    licee = set([canonicalize(el) for el  in data[year]])
    
    # if year == 2014:
    #     f = open("liceu-2014.txt", "w")
    #     for l in licee:
    #         f.write(l + "\n")
    #     f.close()
    # if year == 2015:

    total_licee = total_licee.union(licee)
    print(f"Found {len(licee)} licee for year {year}, total {len(total_licee)}")
    # print(f"New licee for year {year}:")
    # if year > 2014:
    #     for l in licee:
    #         if l not in total_licee:
    #             print(l)

for liceu in years:
    y = years[liceu]
    for i in range(1, len(y)):
        if y[i] - y[i - 1] != 1:
            print(f"Missing year for {liceu}: {y}")
# 1672
f = open("liceu-2014.txt", "w")
for l in licee:
    f.write(l + "\n")
f.close()

Ignored 295/161682 entries
Ignored 254/168939 entries
Ignored 195/137338 entries
Ignored 152/135513 entries
Ignored 40/136864 entries
Found 34 unique characters for year 2014: [' ', '"', ',', '-', '.', '1', '2', '3', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Found 1518 licee for year 2014, total 1518
Found 34 unique characters for year 2015: [' ', '"', ',', '-', '.', '1', '2', '3', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Found 1505 licee for year 2015, total 1537
Found 34 unique characters for year 2016: [' ', '"', ',', '-', '.', '1', '2', '3', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Found 1499 licee for year 2016, total 1544
Found 34 unique characters for year 2017: [' ', '"', ',', '-', '.', '1', '2', '3', 'A', 'B', 'C', 

In [93]:
f = open("output.txt", "w", encoding="utf-8")

for liceu in total_licee:
    # if "." in liceu:
    f.write(f'{liceu}\n')

f.close()

In [100]:
raw = [("AB","Alba"),
("AG","Argeș"),
("AR","Arad"),
("B","București"),
("BC","Bacău"),
("BH","Bihor"),
("BN","Bistrița-Năsăud"),
("BR","Brăila"),
("BT","Botoșani"),
("BV","Brașov"),
("BZ","Buzău"),
("CJ","Cluj"),
("CL","Călărași"),
("CS","Caraș-Severin"),
("CT","Constanța"),
("CV","Covasna"),
("DB","Dâmbovița"),
("DJ","Dolj"),
("GJ","Gorj"),
("GL","Galați"),
("GR","Giurgiu"),
("HD","Hunedoara"),
("HR","Harghita"),
("IF","Ilfov"),
("IL","Ialomița"),
("IS","Iași"),
("MH","Mehedinți"),
("MM","Maramureș"),
("MS","Mureș"),
("NT","Neamț"),
("OT","Olt"),
("PH","Prahova"),
("SB","Sibiu"),
("SJ","Sălaj"),
("SM","Satu-Mare"),
("SV","Suceava"),
("TL","Tulcea"),
("TM","Timiș"),
("TR","Teleorman"),
("VL","Vâlcea"),
("VN","Vrancea"),
("VS","Vaslui"),
]

for i in range(len(raw)):
    x = raw[i][1]
    x = x.upper()
    x = x.split("-")[0]
    x = clear_diacritics(x)
    raw[i] = (raw[i][0], x)

    for j in range(i):
        if raw[i][1] in raw[j][1] or raw[j][1] in raw[i][1]:
            print(raw[i], raw[j])

print(raw)

('MS', 'MURES') ('MM', 'MARAMURES')
[('AB', 'ALBA'), ('AG', 'ARGES'), ('AR', 'ARAD'), ('B', 'BUCURESTI'), ('BC', 'BACAU'), ('BH', 'BIHOR'), ('BN', 'BISTRITA'), ('BR', 'BRAILA'), ('BT', 'BOTOSANI'), ('BV', 'BRASOV'), ('BZ', 'BUZAU'), ('CJ', 'CLUJ'), ('CL', 'CALARASI'), ('CS', 'CARAS'), ('CT', 'CONSTANTA'), ('CV', 'COVASNA'), ('DB', 'DAMBOVITA'), ('DJ', 'DOLJ'), ('GJ', 'GORJ'), ('GL', 'GALATI'), ('GR', 'GIURGIU'), ('HD', 'HUNEDOARA'), ('HR', 'HARGHITA'), ('IF', 'ILFOV'), ('IL', 'IALOMITA'), ('IS', 'IASI'), ('MH', 'MEHEDINTI'), ('MM', 'MARAMURES'), ('MS', 'MURES'), ('NT', 'NEAMT'), ('OT', 'OLT'), ('PH', 'PRAHOVA'), ('SB', 'SIBIU'), ('SJ', 'SALAJ'), ('SM', 'SATU'), ('SV', 'SUCEAVA'), ('TL', 'TULCEA'), ('TM', 'TIMIS'), ('TR', 'TELEORMAN'), ('VL', 'VALCEA'), ('VN', 'VRANCEA'), ('VS', 'VASLUI')]
