In [1]:
"""
Próba normalizacji słowników (bez zapisu do oryginalnych plików):
- importuje src/dictionaries.py
- wczytuje oba CSV
- pokazuje podgląd i podstawowe statystyki
- (opcjonalnie) zapisuje tylko pliki _TEST_*.csv
"""

from pathlib import Path
import pandas as pd

from src.dictionaries import DictSpec, load_dictionary

# Single place to repoint the notebook at another copy of the data directory.
DATA_DIR = Path("/Users/olga/MetaLogic/data")
DICT_SUBJ = DATA_DIR / "dictionary_subjects.csv"
DICT_SCEN = DATA_DIR / "dictionary_scenes.csv"

# Set to False for a preview-only run. When True, only the _TEST_* files
# defined further down are written; the source CSVs are never written to here.
WRITE_TEST_FILES = True


def _missing_pl_mask(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask of rows whose ``pl`` value is missing or blank.

    NaN/None are tested explicitly with ``isna()``: a bare ``astype(str)``
    turns them into the text "nan"/"None", so such rows would not compare
    equal to "" and would silently be left out of the report.
    """
    pl = df["pl"]
    return pl.isna() | (pl.astype(str).str.strip() == "")


df_subj = load_dictionary(DictSpec(DICT_SUBJ, "subjects"))
df_scen = load_dictionary(DictSpec(DICT_SCEN, "scenes"))

# Preview of the loaded frames.
display(df_subj.head(30))
display(df_scen.head(30))

# Compute each mask once; it feeds both the counters and the row listings below.
missing_subj = _missing_pl_mask(df_subj)
missing_scen = _missing_pl_mask(df_scen)

print("SUBJECTS:", len(df_subj), "rows | brak PL:", int(missing_subj.sum()))
print("SCENES:  ", len(df_scen), "rows | brak PL:", int(missing_scen.sum()))

print("SUBJECTS duplicate keys:", int(df_subj["key"].duplicated().sum()))
print("SCENES duplicate keys:  ", int(df_scen["key"].duplicated().sum()))

# Rows without a Polish value (at most 30 per dictionary).
display(df_subj.loc[missing_subj, ["key", "en", "pl"]].head(30))
display(df_scen.loc[missing_scen, ["key", "en", "pl"]].head(30))

# OPTIONAL: trial write to separate _TEST_* files (the originals are not overwritten).
OUT_SUBJ = DATA_DIR / "_TEST_dictionary_subjects.normalized.csv"
OUT_SCEN = DATA_DIR / "_TEST_dictionary_scenes.normalized.csv"

if WRITE_TEST_FILES:
    df_subj.to_csv(OUT_SUBJ, index=False)
    df_scen.to_csv(OUT_SCEN, index=False)

    print("Wrote test files:")
    print(" -", OUT_SUBJ)
    print(" -", OUT_SCEN)

Unnamed: 0,en,pl,notes,active,updated_at,key,kind
0,aerosol_can,puszka aerozolowa,,True,,aerosol_can,subjects
1,air_conditioner,klimatyzator,,True,,air_conditioner,subjects
2,airplane,samolot,,True,,airplane,subjects
3,alarm_clock,budzik,,True,,alarm_clock,subjects
4,alcohol,alkohol,,True,,alcohol,subjects
5,alligator,aligator,,True,,alligator,subjects
6,almond,migdał,,True,,almond,subjects
7,ambulance,karetka,,True,,ambulance,subjects
8,amplifier,wzmacniacz,,True,,amplifier,subjects
9,anklet,bransoletka na kostkę,,True,,anklet,subjects


Unnamed: 0,en,pl,notes,active,updated_at,key,kind
0,airplane exterior,samolot,,True,,airplane_exterior,scenes
1,airport or train station,lotnisko lub dworzec,,True,,airport_or_train_station,scenes
2,apartment block area,blokowisko lub osiedle mieszkaniowe,,True,,apartment_block_area,scenes
3,bicycles or motorcycles,rowery lub motocykle,,True,,bicycles_or_motorcycles,scenes
4,bridges or viaducts,mosty lub wiadukty,,True,,bridges_or_viaducts,scenes
5,bus or tram exterior,autobus lub tramwaj,,True,,bus_or_tram_exterior,scenes
6,car exterior,auto,,True,,car_exterior,scenes
7,cigarettes or street smoking,papierosy lub palenie na ulicy,,True,,cigarettes_or_street_smoking,scenes
8,city park,park lub zieleń miejska,,True,,city_park,scenes
9,city square,plac miejski,,True,,city_square,scenes


SUBJECTS: 1204 rows | brak PL: 0
SCENES:   56 rows | brak PL: 0
SUBJECTS duplicate keys: 0
SCENES duplicate keys:   0


Unnamed: 0,key,en,pl


Unnamed: 0,key,en,pl


Wrote test files:
 - /Users/olga/MetaLogic/data/_TEST_dictionary_subjects.normalized.csv
 - /Users/olga/MetaLogic/data/_TEST_dictionary_scenes.normalized.csv


In [2]:
"""
Diagnostyka CSV: wykrycie separatora i podgląd problematycznej linii.
"""
from pathlib import Path
import csv

p = Path("/Users/olga/MetaLogic/data/dictionary_subjects.csv")

# Read and decode the file once; the sniffing sample and the line lookup both
# reuse this text. errors="replace" keeps the diagnostic running even when some
# bytes are not valid UTF-8.
text = p.read_text(encoding="utf-8", errors="replace")

# 1) Sniff the delimiter from the first 20000 characters.
#    `delimiters` is a string of candidate characters.
sample = text[:20000]
dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
print("Detected delimiter:", repr(dialect.delimiter))

# 2) Show the line reported by pandas (1-indexed; the error message said line 683).
bad_line_no = 683
lines = text.splitlines()
if lines:
    print("Header:", lines[0])
else:
    print("Header: <file has no lines>")

# Guard the lookup so a shorter file yields a readable message, not an IndexError.
if 1 <= bad_line_no <= len(lines):
    print(f"Line {bad_line_no}:", lines[bad_line_no-1])
else:
    print(f"Line {bad_line_no}: <out of range; file has {len(lines)} lines>")

Detected delimiter: ';'
Header: subject_en;subject_pl
Line 683: measuring_stick;miarka (linijka, tyczka pomiarowa)


In [3]:
"""
Odporne wczytanie CSV: właściwy separator + python engine + pomijanie złych linii (do diagnozy).
Nic nie zapisuje.
"""
from pathlib import Path
import pandas as pd

p = Path("/Users/olga/MetaLogic/data/dictionary_subjects.csv")

# Parser settings are grouped so the delimiter is easy to adjust.
# If the delimiter-sniffing cell reported ';', keep sep=";"; otherwise put the
# detected delimiter here: ',', ';', '\t' or '|'.
read_options = dict(
    sep=";",
    engine="python",
    on_bad_lines="warn",  # warn about each malformed line and skip it until the file is fixed
)
df_raw = pd.read_csv(p, **read_options)

# Quick look at what was parsed: dimensions, column names, first rows.
print("shape:", df_raw.shape)
print("columns:", list(df_raw.columns))
display(df_raw.head(10))

shape: (1204, 2)
columns: ['subject_en', 'subject_pl']


Unnamed: 0,subject_en,subject_pl
0,aerosol_can,puszka aerozolowa
1,air_conditioner,klimatyzator
2,airplane,samolot
3,alarm_clock,budzik
4,alcohol,alkohol
5,alligator,aligator
6,almond,migdał
7,ambulance,karetka
8,amplifier,wzmacniacz
9,anklet,bransoletka na kostkę
