In [1]:
# CELL 1: imports, config, required schema
import os, sys, csv, json, textwrap, math, gc
import pandas as pd
import numpy as np

# Robust CSV reading: auto-detect delimiter
def read_csv_auto(path, nrows=None):
    return pd.read_csv(path, sep=None, engine='python', nrows=nrows)

# Four labeled files
FILES = {
    'sf_after_renovation.csv': 'AFTER_RENOVATION',
    'sf_developer_state.csv' : 'DEVELOPER_STATE',
    'sf_for_renovation.csv'  : 'FOR_RENOVATION',
    'sf_good.csv'            : 'GOOD',
}

# Required schema per the training/inference pipeline
REQUIRED_TEXT = ['Description']
REQUIRED_NUM  = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
REQUIRED_CAT  = ['BuildingType', 'OfferFrom', 'TypeOfMarket']
REQUIRED_DT   = ['BuiltYear']  # used to derive 'year'
REQUIRED_ALL  = REQUIRED_TEXT + REQUIRED_NUM + REQUIRED_CAT + REQUIRED_DT

print("Required columns:", REQUIRED_ALL)


Required columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']


In [2]:
# CELL 2 (robust loader): load and normalize column names with resilient CSV parsing
import os, io, csv
import pandas as pd

try:
    import chardet
except ImportError:
    chardet = None

# Prefer semicolon first (the notebook writes CSV with sep=';'), then sniff/fallbacks
PREFERRED_SEPARATORS = [';', ',', '\t', '|']

def detect_encoding(path, sample_bytes=512_000):
    if chardet is None:
        # Common encodings for PL datasets
        for enc in ['utf-8-sig', 'utf-8', 'cp1250', 'latin1']:
            try:
                with open(path, 'rb') as f:
                    f.read(64)
                return enc
            except Exception:
                continue
        return 'utf-8'
    with open(path, 'rb') as f:
        raw = f.read(sample_bytes)
    res = chardet.detect(raw) or {}
    return (res.get('encoding') or 'utf-8')

def sniff_delimiter(path, encoding, sample_lines=500):
    from itertools import islice
    with open(path, 'r', encoding=encoding, errors='replace', newline='') as f:
        sample = ''.join(list(islice(f, sample_lines)))
    try:
        dialect = csv.Sniffer().sniff(sample, delimiters=',;\t|')
        return dialect.delimiter
    except Exception:
        counts = {sep: sample.count(sep) for sep in [',',';','\t','|']}
        return max(counts, key=counts.get)

def try_read(path, sep, encoding, quoting_mode=None, force_quote_none=False):
    kw = dict(sep=sep, engine='python', encoding=encoding, on_bad_lines='skip',
              quotechar='"', doublequote=True)
    if force_quote_none:
        kw.update(dict(quotechar='\x01', quoting=csv.QUOTE_NONE, escapechar='\\'))
    if quoting_mode is not None:
        kw.update(dict(quoting=quoting_mode))
    return pd.read_csv(path, **kw)

def read_csv_robust(path):
    enc_candidates = [detect_encoding(path)] + ['utf-8-sig','utf-8','cp1250','latin1']
    seen = set(); enc_candidates = [e for e in enc_candidates if not (e in seen or seen.add(e))]

    # 1) Prefer semicolon, then sniffed, then list
    sep_candidates = PREFERRED_SEPARATORS.copy()
    try:
        sniffed = sniff_delimiter(path, enc_candidates)
        if sniffed not in sep_candidates:
            sep_candidates.insert(1, sniffed)
    except Exception:
        pass

    last_err = None
    for enc in enc_candidates:
        for sep in sep_candidates:
            for mode in [None, csv.QUOTE_MINIMAL]:
                try:
                    return try_read(path, sep=sep, encoding=enc, quoting_mode=mode), (enc, sep, mode, False)
                except Exception as e:
                    last_err = e
            try:
                return try_read(path, sep=sep, encoding=enc, force_quote_none=True), (enc, sep, 'QUOTE_NONE+escape', True)
            except Exception as e:
                last_err = e
                continue
    raise last_err if last_err else RuntimeError('Unable to read CSV')

# Load all four files
dfs = {}
for path, label in FILES.items():
    if not os.path.exists(path):
        print(f"[MISSING] {path}")
        continue
    try:
        df, used = read_csv_robust(path)
        enc, sep, quoting_used, forced = used
        df.columns = [c.strip() for c in df.columns]
        dfs[path] = df
        print(f"[OK] {path}: shape={df.shape} | encoding={enc} | sep={repr(sep)} | quoting={quoting_used} | forced_quote_none={forced}")
        print("     columns(sample):", df.columns[:12].tolist())
    except Exception as e:
        print(f"[FAIL] {path}: {type(e).__name__}: {e}")

assert len(dfs) == len(FILES), "Some input files failed to load"


[OK] sf_after_renovation.csv: shape=(20384, 1) | encoding=utf-8-sig | sep=';' | quoting=QUOTE_NONE+escape | forced_quote_none=True
     columns(sample): ['"5376217","NULL","22","Ochota | Mołdawska – mieszkanie z widokiem na panoramę Warszawy","Na ulicy Mołdawskiej, w sercu Ochoty, znajduje się mieszkanie z widokiem na panoramę Warszawy. To przestrzeń, w której codzienność łączy wygodę życia w spokojnym otoczeniu z szybkim dostępem do centrum. Kameralna atmosfera osiedla i dobrze rozwinięta infrastruktura sprawiają, że to miejsce spełnia oczekiwania zarówno rodzin, jak i osób aktywnie korzystających z uroków miasta. Wnętrze z potencjałem Na 49 m² rozplanowano funkcjonalnie trzy pokoje: jasny salon (17 m²) z dużym oknem i pięknym widokiem, sypialnię (12 m²), przytulny trzeci pokój (6,5 m²) – idealny na gabinet lub pokój dziecka, osobną widną kuchnię (5 m²) oraz łazienkę z WC (3 m²). Przemyślany układ uzupełnia przedpokój (5,5 m²) z miejscem na zabudowę. Mieszkanie jest po generalnym remo

In [6]:
# CELL 2a: raw preview of files + delimiter frequency
import os
from collections import Counter

CAND_SEPS = [',',';','\t','|','^','~']

def raw_head(path, encoding='utf-8', n=10):
    lines=[]
    with open(path, 'r', encoding=encoding, errors='replace', newline='') as f:
        for i, line in enumerate(f, 1):
            lines.append((i, line.rstrip('\n')))
            if i>=n: break
    return lines

def guess_encoding(path):
    try:
        import chardet
        with open(path,'rb') as f:
            data=f.read(512000)
        return (chardet.detect(data) or {}).get('encoding') or 'utf-8'
    except Exception:
        return 'utf-8'

def sep_stats(lines):
    stats=[]
    for s in CAND_SEPS:
        stats.append((s, sum(l.count(s) for _,l in lines)))
    return sorted(stats, key=lambda x: x[1], reverse=True)

for path in FILES.keys():
    if not os.path.exists(path):
        print(f"[MISSING] {path}")
        continue
    enc = guess_encoding(path)
    lines = raw_head(path, encoding=enc, n=10)
    print(f"\n=== {path} | encoding≈{enc} ===")
    for i, l in lines:
        print(f"{i:03d}: {l!r}")
    print("Separator counts (top):", sep_stats(lines)[:5])



=== sf_after_renovation.csv | encoding≈Windows-1254 ===
001: '"5376217","NULL","22","Ochota | MoÅ‚dawska â€“ mieszkanie z widokiem na panoramÄ™ Warszawy","Na ulicy MoÅ‚dawskiej, w sercu Ochoty, znajduje siÄ™ mieszkanie z widokiem na panoramÄ™ Warszawy. To przestrzeÅ„, w ktÃ³rej codziennoÅ›Ä‡ Å‚Ä…czy wygodÄ™ Å¼ycia w spokojnym otoczeniu z szybkim dostÄ™pem do centrum. Kameralna atmosfera osiedla i dobrze rozwiniÄ™ta infrastruktura sprawiajÄ…, Å¼e to miejsce speÅ‚nia oczekiwania zarÃ³wno rodzin, jak i osÃ³b aktywnie korzystajÄ…cych z urokÃ³w miasta. WnÄ™trze z potencjaÅ‚em Na 49 mÂ² rozplanowano funkcjonalnie trzy pokoje: jasny salon (17 mÂ²) z duÅ¼ym oknem i piÄ™knym widokiem, sypialniÄ™ (12 mÂ²), przytulny trzeci pokÃ³j (6,5 mÂ²) â€“ idealny na gabinet lub pokÃ³j dziecka, osobnÄ… widnÄ… kuchniÄ™ (5 mÂ²) oraz Å‚azienkÄ™ z WC (3 mÂ²). PrzemyÅ›lany ukÅ‚ad uzupeÅ‚nia przedpokÃ³j (5,5 mÂ²) z miejscem na zabudowÄ™. Mieszkanie jest po generalnym remoncie (lipiec 2021) â€“ od tamtej pory nikt

In [7]:
# CELL 2b: auto-pick header line, re-read with inferred header and sep
import csv, re
import pandas as pd
from itertools import islice

def score_header_line(line, sep):
    # więcej tokenów alfanumerycznych i krótkie tokeny => bardziej jak nagłówki
    toks = [t.strip() for t in line.split(sep)]
    alnum = sum(bool(re.search(r'[A-Za-zĄąĆćĘęŁłŃńÓóŚśŹźŻż]', t)) for t in toks)
    uniq = len(set(toks))
    short = sum(len(t)<=30 for t in toks)
    return alnum*2 + uniq + short*0.5

def infer_sep_by_counts(line):
    counts = {s: line.count(s) for s in [',',';','\t','|','^','~']}
    return max(counts, key=counts.get)

def find_header(path, encoding, scan_lines=200):
    with open(path, 'r', encoding=encoding, errors='replace', newline='') as f:
        lines = list(islice(f, scan_lines))
    # infer sep from the densest line among first 5
    probe = ''.join(lines[:5])
    sep = infer_sep_by_counts(probe)
    best_i, best_score = 0, -1
    for i, ln in enumerate(lines[:50]):  # zwykle w nagłówku
        sc = score_header_line(ln.rstrip('\n'), sep)
        if sc>best_score:
            best_score, best_i = sc, i
    header_line = lines[best_i].rstrip('\n')
    header_tokens = [t.strip() for t in header_line.split(sep)]
    return sep, best_i, header_tokens

def read_with_header(path):
    enc = guess_encoding(path)
    sep, hdr_idx, names = find_header(path, enc)
    print(f"{path}: sep={repr(sep)} | header_line={hdr_idx+1} | columns_detected={names}")
    # read all, then drop rows up to header line and set names
    df_raw = pd.read_csv(path, sep=sep, engine='python', header=None, encoding=enc, on_bad_lines='skip')
    df = df_raw.iloc[hdr_idx+1:].reset_index(drop=True).copy()
    df.columns = [str(c).strip() for c in names]
    return df, {'encoding':enc, 'sep':sep, 'header_line':hdr_idx+1, 'columns':names}

dfs2 = {}
meta2 = {}
for path in FILES.keys():
    if os.path.exists(path):
        try:
            df, meta = read_with_header(path)
            dfs2[path] = df
            meta2[path] = meta
            print(f"[OK] {path}: shape={df.shape}")
            print("     columns(sample):", df.columns[:20].tolist())
        except Exception as e:
            print(f"[FAIL] {path}: {e}")

# If success, replace dfs by dfs2 for next steps
if dfs2 and len(dfs2)==len(FILES):
    dfs = dfs2


sf_after_renovation.csv: sep=',' | header_line=48 | columns_detected=['"5375146"', '"NULL"', '"11"', '"Mieszkanie', 'KrakÃ³w', 'PodgÃ³rze Duchackie', 'Wola Duchacka', '48 mÂ²"', '"Mieszkanie', 'KrakÃ³w', 'PodgÃ³rze Duchackie', 'Wola Duchacka', '48 mÂ² OgÅ‚oszeniodawca: Bracia Sadurscy NieruchomoÅ›ci Kontakt: Julia Tarkowska Informacje szczegÃ³Å‚owe Forma wÅ‚asnoÅ›ci: OdrÄ™bna wÅ‚asnoÅ›Ä‡ lokalu Typ kuchni: aneks kuchenny - poÅ‚Ä…czony z salonem Stan nieruchomoÅ›ci: wysoki standard Stolarka okienna: PCVDodatkowe koszty: 600 TytuÅ‚ ogÅ‚oszenia: 2-pok./wysok standard/2022/podwÃ³jny garaÅ¼/ Szpakowa Nr oferty ogÅ‚oszeniodawcy: BS1- MS-311082 Ulica: Szpakowa Powierzchnia balkonu: 6.00 mÂ² Powierzchnia uÅ¼ytkowa: 47.83 mÂ² PiÄ™tro: 8 Liczba piÄ™ter: 11 Pokoje: 2 Liczba Å‚azienek: 1 Liczba sypialni: 1 Czy Å‚azienka z WC: Razem Typ rynku: WtÃ³rny Budynek Rok budowy: 2022 MateriaÅ‚ budowlany: zrÃ³Å¼nicowany Udogodnieniabalkoninternetklimatyzacja | Sprzedam 2-pok. mieszkanie z 2022r. / meble wyk

In [8]:
# CELL 3: schema normalization (auto-rename) and validation
import re, difflib
import pandas as pd

# Expected schema from the notebook pipeline
REQUIRED_TEXT = ['Description']
REQUIRED_NUM  = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
REQUIRED_CAT  = ['BuildingType', 'OfferFrom', 'TypeOfMarket']
REQUIRED_DT   = ['BuiltYear']
REQUIRED_ALL  = REQUIRED_TEXT + REQUIRED_NUM + REQUIRED_CAT + REQUIRED_DT

# Synonyms dictionary (lowercased, whitespace/underscore-insensitive match)
COLUMN_SYNONYMS = {
    'Description': ['description','opis','opis_oferty','opisoferty','desc','full_description','opis_pelny','tytul_opisu','title_description'],
    'Area': ['area','powierzchnia','metraz','metraż','pow','m2','sqm','surface','powierzchnia_m2'],
    'Price': ['price','cena','kwota','amount','price_pln','cena_pln','wartosc','wartość'],
    'NumberOfRooms': ['numberofrooms','rooms','liczbapokoi','pokoje','pokoi','iloscpokoi','rooms_count','roomsnumber','roomsno'],
    'Floor': ['floor','piętro','pietro','poziom','floor_no','floornumber'],
    'Floors': ['floors','liczbapieter','liczba_pieter','kondygnacje','kondygnacji','pieter','pietra_total','total_floors','floors_total'],
    'BuildingType': ['buildingtype','typbudynku','rodzajbudynku','typ_budynku','type_building','building_type'],
    'OfferFrom': ['offerfrom','ofertaod','zrodlo','źródło','source','sprzedajacy','sprzedający','agency','offer_from'],
    'TypeOfMarket': ['typeofmarket','rynek','market','typ_rynku','market_type'],
    'BuiltYear': ['builtyear','rokbudowy','rok_budowy','year_built','rok','data_budowy','rok_powstania']
}

def normalize_token(s: str) -> str:
    return re.sub(r'[^a-z0-9]+','', str(s).strip().lower())

def build_normal_index(cols):
    idx = {}
    for c in cols:
        idx[normalize_token(c)] = c
    return idx

def auto_map_columns(df: pd.DataFrame):
    original_cols = list(df.columns)
    norm_index = build_normal_index(original_cols)
    rename_map = {}
    report = []

    for target in REQUIRED_ALL:
        # 1) exact normalized name
        key = normalize_token(target)
        if key in norm_index:
            rename_map[norm_index[key]] = target
            report.append((target, norm_index[key], 'exact'))
            continue
        # 2) synonyms
        matched = False
        for syn in COLUMN_SYNONYMS.get(target, []):
            syn_key = normalize_token(syn)
            if syn_key in norm_index:
                rename_map[norm_index[syn_key]] = target
                report.append((target, norm_index[syn_key], 'synonym'))
                matched = True
                break
        if matched:
            continue
        # 3) fuzzy close match among available columns
        candidates = difflib.get_close_matches(target.lower(), [c.lower() for c in original_cols], n=1, cutoff=0.88)
        if candidates:
            # pick the first candidate that is not already mapped
            cand = candidates
            # find original case column
            orig = next(c for c in original_cols if c.lower()==cand)
            if orig not in rename_map:
                rename_map[orig] = target
                report.append((target, orig, 'fuzzy'))
                continue
        # 4) leave as missing for now
        report.append((target, None, 'missing'))

    # Apply rename only for mapped items
    df2 = df.rename(columns=rename_map)
    return df2, report

# Apply auto mapping to each loaded DataFrame in dfs
mapping_reports = {}
for path, df in list(dfs.items()):
    df_mapped, rep = auto_map_columns(df)
    dfs[path] = df_mapped
    mapping_reports[path] = rep
    # Print mapping summary
    print(f"\n[{path}] column mapping summary:")
    for target, source, how in rep:
        print(f" - {target:<16} <- {source if source is not None else 'MISSING'} [{how}]")
    # Show unmapped required
    unmapped = [t for t,s,how in rep if s is None]
    if unmapped:
        print("Unmapped required columns:", unmapped)

# Re-run schema validation after mapping
problems = []
for path, df in dfs.items():
    cols = set(df.columns)
    missing = [c for c in REQUIRED_ALL if c not in cols]
    extra   = [c for c in cols if c not in REQUIRED_ALL]
    print(f"\n[{path}] post-mapping schema:")
    print(" - missing:", missing)
    print(" - extra  :", extra[:20], "(+ more)" if len(extra) > 20 else "")
    if missing:
        problems.append((path, "missing_columns", missing))

assert not problems, f"Schema issues after auto-mapping: {problems}"



[sf_after_renovation.csv] column mapping summary:
 - Description      <- MISSING [missing]
 - Area             <- MISSING [missing]
 - Price            <- MISSING [missing]
 - NumberOfRooms    <- MISSING [missing]
 - Floor            <- MISSING [missing]
 - Floors           <- MISSING [missing]
 - BuildingType     <- MISSING [missing]
 - OfferFrom        <- MISSING [missing]
 - TypeOfMarket     <- MISSING [missing]
 - BuiltYear        <- MISSING [missing]
Unmapped required columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']

[sf_developer_state.csv] column mapping summary:
 - Description      <- MISSING [missing]
 - Area             <- MISSING [missing]
 - Price            <- MISSING [missing]
 - NumberOfRooms    <- MISSING [missing]
 - Floor            <- MISSING [missing]
 - Floors           <- MISSING [missing]
 - BuildingType     <- MISSING [missing]
 - OfferFrom        <- MISSING [missing]
 - Type

AssertionError: Schema issues after auto-mapping: [('sf_after_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_developer_state.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_for_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_good.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear'])]

In [9]:
# CELL 3b (optional): headerless CSV fallback per file with all-missing required columns
from copy import deepcopy

def headerless_fallback(path, df):
    # Heuristic: if po auto-mapping z komórki 3 mamy brak wszystkich REQUIRED_ALL,
    # spróbujmy ponownie wczytać plik z header=None i użyć pierwszego wiersza jako nagłówków.
    required_missing_all = all(c not in df.columns for c in REQUIRED_ALL)
    if not required_missing_all:
        return df, False

    print(f"[{path}] Trying header=None fallback ...")
    enc = 'utf-8-sig'
    try:
        # Użyj tych samych parametrów separacji, które wypisała komórka 2 (jeśli logowano),
        # w razie braku – spróbuj ';' i ',' po kolei.
        for sep in ['; ',',']:
            try:
                df0 = pd.read_csv(path, sep=sep, engine='python', header=None, on_bad_lines='skip', encoding=enc)
                # Pierwszy wiersz jako nagłówek
                new_cols = df0.iloc.astype(str).tolist()
                df1 = df0.iloc[1:].copy()
                df1.columns = [c.strip() for c in new_cols]
                df1, _ = auto_map_columns(df1)
                return df1, True
            except Exception:
                continue
    except Exception as e:
        print(f"[{path}] header=None fallback failed: {e}")
    return df, False

# Apply fallback only if absolutely necessary
reloaded = {}
for path, df in list(dfs.items()):
    df2, changed = headerless_fallback(path, df)
    dfs[path] = df2
    reloaded[path] = changed
    if changed:
        print(f"[{path}] Headerless fallback applied and columns remapped.")

# Final schema check
problems = []
for path, df in dfs.items():
    missing = [c for c in REQUIRED_ALL if c not in df.columns]
    print(f"[{path}] final missing columns:", missing)
    if missing:
        problems.append((path, "missing_columns", missing))
assert not problems, f"Schema issues remain after fallback: {problems}"


[sf_after_renovation.csv] Trying header=None fallback ...
[sf_developer_state.csv] Trying header=None fallback ...
[sf_for_renovation.csv] Trying header=None fallback ...
[sf_good.csv] Trying header=None fallback ...
[sf_after_renovation.csv] final missing columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']
[sf_developer_state.csv] final missing columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']
[sf_for_renovation.csv] final missing columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']
[sf_good.csv] final missing columns: ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']


AssertionError: Schema issues remain after fallback: [('sf_after_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_developer_state.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_for_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_good.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear'])]

In [10]:
# CELL 3c: mapping suggestions + manual mapping hook
import difflib

REQUIRED_TEXT = ['Description']
REQUIRED_NUM  = ['Area','Price','NumberOfRooms','Floor','Floors']
REQUIRED_CAT  = ['BuildingType','OfferFrom','TypeOfMarket']
REQUIRED_DT   = ['BuiltYear']
REQUIRED_ALL  = REQUIRED_TEXT + REQUIRED_NUM + REQUIRED_CAT + REQUIRED_DT

for path, df in dfs.items():
    cols = list(df.columns)
    print(f"\n[{path}] mapping suggestions:")
    for target in REQUIRED_ALL:
        cands = difflib.get_close_matches(target.lower(), [c.lower() for c in cols], n=5, cutoff=0.6)
        # odzyskaj oryginalną wielkość liter
        cands_orig = []
        for c in cands:
            cands_orig.append(next(cc for cc in cols if cc.lower()==c))
        print(f" - {target:<16}: {cands_orig}")

# Manual mapping to fill if needed: {'existing_col_name':'ExpectedName'}
manual_map = {
    # 'opis':'Description',
    # 'powierzchnia':'Area',
    # 'cena':'Price',
    # 'pokoje':'NumberOfRooms',
    # 'pietro':'Floor',
    # 'liczba_pieter':'Floors',
    # 'typ_budynku':'BuildingType',
    # 'zrodlo':'OfferFrom',
    # 'rynek':'TypeOfMarket',
    # 'rok_budowy':'BuiltYear',
}

for path in list(dfs.keys()):
    if manual_map:
        dfs[path] = dfs[path].rename(columns=manual_map)

# Validate after manual map
problems = []
for path, df in dfs.items():
    missing = [c for c in REQUIRED_ALL if c not in df.columns]
    print(f"[{path}] missing after manual map:", missing)
    if missing:
        problems.append((path, 'missing_columns', missing))
assert not problems, f"Still missing required columns. Please extend manual_map. Details: {problems}"



[sf_after_renovation.csv] mapping suggestions:
 - Description     : []
 - Area            : []
 - Price           : []
 - NumberOfRooms   : []
 - Floor           : []
 - Floors          : []
 - BuildingType    : []
 - OfferFrom       : []
 - TypeOfMarket    : []
 - BuiltYear       : []

[sf_developer_state.csv] mapping suggestions:
 - Description     : []
 - Area            : []
 - Price           : []
 - NumberOfRooms   : []
 - Floor           : []
 - Floors          : []
 - BuildingType    : []
 - OfferFrom       : []
 - TypeOfMarket    : []
 - BuiltYear       : []

[sf_for_renovation.csv] mapping suggestions:
 - Description     : []
 - Area            : []
 - Price           : []
 - NumberOfRooms   : []
 - Floor           : []
 - Floors          : []
 - BuildingType    : []
 - OfferFrom       : []
 - TypeOfMarket    : []
 - BuiltYear       : []

[sf_good.csv] mapping suggestions:
 - Description     : []
 - Area            : []
 - Price           : []
 - NumberOfRooms   : []
 - Floo

AssertionError: Still missing required columns. Please extend manual_map. Details: [('sf_after_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_developer_state.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_for_renovation.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear']), ('sf_good.csv', 'missing_columns', ['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear'])]

In [11]:
# CELL 3d (optional): expand JSON if files contain JSON records
import json
from pandas import json_normalize

def try_expand_json(df):
    expanded = None
    for col in df.columns:
        sample = df[col].dropna().astype(str).head(50).tolist()
        # heur: jeśli większość niepustych wierszy zaczyna się znakiem { lub [, mamy JSON
        if sample and sum(s.strip().startswith(('{','[')) for s in sample) >= max(5, int(0.6*len(sample))):
            try:
                objs = []
                for s in df[col].fillna('').astype(str):
                    s = s.strip()
                    if not s:
                        objs.append({})
                        continue
                    try:
                        objs.append(json.loads(s))
                    except Exception:
                        objs.append({})
                exp = json_normalize(objs)
                # Dołącz prefiks kolumny źródłowej, żeby uniknąć konfliktów
                exp.columns = [f"{col}.{c}" for c in exp.columns]
                other = df.drop(columns=[col])
                expanded = pd.concat([other.reset_index(drop=True), exp.reset_index(drop=True)], axis=1)
                return expanded
            except Exception:
                continue
    return expanded

changed = False
for path in list(dfs.keys()):
    df = dfs[path]
    if df.shape[1] <= 3:  # podejrzanie mało kolumn jak na CSV tabelaryczne
        df2 = try_expand_json(df)
        if df2 is not None and df2.shape[1] > df.shape[1]:
            dfs[path] = df2
            changed = True
            print(f"[{path}] Expanded JSON-like column(s): new shape={df2.shape}")

if changed:
    # Po rozwinięciu uruchom ponownie CELL 3c, aby zmapować kolumny.
    print("Run CELL 3c again to map expanded columns to required schema.")


In [12]:
# CELL 4: missingness and text/date sanity
for path, df in dfs.items():
    print(f"\n[{path}] Missingness and sanity")
    sub = df[REQUIRED_ALL].copy()

    # Missingness
    miss_pct = sub.isna().mean().sort_values(ascending=False) * 100
    print("Missing % (top 15):")
    print(miss_pct.head(15).round(2))

    # Description emptiness
    desc = sub['Description'].fillna('')
    empty_ratio = (desc.str.strip() == '').mean() * 100
    print(f"Empty Description: {empty_ratio:.2f}%")

    # BuiltYear parse success
    by = pd.to_datetime(sub['BuiltYear'], errors='coerce')
    parse_ok = by.notna().mean() * 100
    year_min, year_max = by.dt.year.min(), by.dt.year.max()
    print(f"BuiltYear parse OK: {parse_ok:.2f}% | year_range=({year_min}, {year_max})")



[sf_after_renovation.csv] Missingness and sanity


KeyError: "None of [Index(['Description', 'Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors',\n       'BuildingType', 'OfferFrom', 'TypeOfMarket', 'BuiltYear'],\n      dtype='object')] are in the [columns]"

In [13]:
# CELL 5: numeric distributions and plausible ranges
RANGES = {
    'Area': (5, 1000),            # m^2
    'Price': (1000, 1e9),         # PLN-like
    'NumberOfRooms': (0, 50),
    'Floor': (-3, 200),
    'Floors': (1, 200),
}
for path, df in dfs.items():
    print(f"\n[{path}] Numeric stats and range checks")
    num = df[REQUIRED_NUM].apply(pd.to_numeric, errors='coerce')
    print(num.describe(percentiles=[.01,.05,.5,.95,.99]).T)

    for col, (lo, hi) in RANGES.items():
        bad = num[col].notna() & ((num[col] < lo) | (num[col] > hi))
        print(f"Out-of-range {col}: {bad.sum()} rows | expected [{lo},{hi}]")



[sf_after_renovation.csv] Numeric stats and range checks


KeyError: "None of [Index(['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors'], dtype='object')] are in the [columns]"

In [14]:
# CELL 6: categorical levels overview
for path, df in dfs.items():
    print(f"\n[{path}] Categorical levels")
    for col in REQUIRED_CAT:
        vc = df[col].astype(str).fillna('NA').str.strip().value_counts(dropna=False)
        print(f"- {col}: unique={vc.shape}")
        print(vc.head(20))



[sf_after_renovation.csv] Categorical levels


KeyError: 'BuildingType'

In [15]:
# CELL 7: text diagnostics
for path, df in dfs.items():
    print(f"\n[{path}] Text length diagnostics")
    desc = df['Description'].fillna('').astype(str)
    chars = desc.str.len()
    words = desc.str.split().map(len)
    print("Chars len: mean={:.1f}, p50={:.0f}, p90={:.0f}, p99={:.0f}".format(
        chars.mean(), chars.median(), chars.quantile(.9), chars.quantile(.99)))
    print("Words len: mean={:.1f}, p50={:.0f}, p90={:.0f}, p99={:.0f}".format(
        words.mean(), words.median(), words.quantile(.9), words.quantile(.99)))
    short = (words < 3).mean() * 100
    print(f"Very short (<3 words): {short:.2f}%")



[sf_after_renovation.csv] Text length diagnostics


KeyError: 'Description'

In [16]:
# CELL 8: merge labeled datasets
merged = []
for path, target in FILES.items():
    df = dfs[path].copy()
    df['State'] = target  # target labels aligned with notebook classes
    merged.append(df)
full = pd.concat(merged, ignore_index=True)
print("Merged shape:", full.shape)
print("Class counts:\n", full['State'].value_counts())


Merged shape: (79351, 5)
Class counts:
 State
AFTER_RENOVATION    20384
GOOD                19875
DEVELOPER_STATE     19829
FOR_RENOVATION      19263
Name: count, dtype: int64


In [17]:
# CELL 9: category harmonization across files
for col in REQUIRED_CAT:
    levels_per_file = {p: set(dfs[p][col].dropna().astype(str).str.strip().unique()) for p in dfs}
    union = set().union(*levels_per_file.values())
    print(f"\n{col}: union={len(union)}")
    only_in = {}
    for p, lv in levels_per_file.items():
        only = lv - set().union(*(levels_per_file[q] for q in levels_per_file if q != p))
        only_in[p] = only
    for p, only in only_in.items():
        if only:
            print(f" - only in {p}: {sorted(list(only))[:20]}")


KeyError: 'BuildingType'

In [18]:
# CELL 10: duplicates check
if 'SaleId' in full.columns:
    dup_id = full.duplicated(subset=['SaleId'], keep=False).sum()
    print(f"Duplicates by SaleId: {dup_id}")

keys = [c for c in ['Description','Price','Area'] if c in full.columns]
if keys:
    dup_keys = full.duplicated(subset=keys, keep=False).sum()
    print(f"Duplicates by {keys}: {dup_keys}")


In [19]:
# CELL 11: enforce dtypes and build 'year'
for col in REQUIRED_NUM:
    full[col] = pd.to_numeric(full[col], errors='coerce')
for col in REQUIRED_CAT:
    full[col] = full[col].astype(str).str.strip()

built_dt = pd.to_datetime(full['BuiltYear'], errors='coerce')
ok = built_dt.notna().mean() * 100
full['year'] = built_dt.dt.year
med = full['year'].median()
full['year'] = full['year'].fillna(med)
print(f"BuiltYear parse OK: {ok:.2f}% | filled NA in 'year' with median {med}")


KeyError: 'Area'

In [20]:
# CELL 12: dry-run preprocessing identical to notebook training
from sklearn.preprocessing import StandardScaler
num_features_train = ['Area','Price','NumberOfRooms','Floor','Floors','year']
cat_features_train = ['BuildingType','OfferFrom','TypeOfMarket']

scaler = StandardScaler()
X_num = scaler.fit_transform(full[num_features_train])

X_cat = pd.get_dummies(full[cat_features_train], dummy_na=False)
encoded_categorical_columns = X_cat.columns.tolist()

X_tabular = np.concatenate([X_num, X_cat.values], axis=1)
print("Tabular shapes -> num:", X_num.shape, "cat:", X_cat.shape, "concat:", X_tabular.shape)


KeyError: "None of [Index(['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'year'], dtype='object')] are in the [columns]"

In [21]:
# CELL 13: tokenizer dry-run (as in notebook)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words, max_len = 10000, 200
tok = Tokenizer(num_words=max_words, oov_token="<UNK>")
tok.fit_on_texts(full['Description'].fillna('').astype(str))

# Coverage estimate
total_tokens = sum(tok.word_counts.values())
top_words = [w for w,i in tok.word_index.items() if i <= max_words]
top_tokens = sum(tok.word_counts[w] for w in top_words if w in tok.word_counts)
coverage = 100.0 * top_tokens / max(1, total_tokens)
print(f"Vocab size={len(tok.word_index)} | coverage@{max_words}={coverage:.2f}%")

seqs = tok.texts_to_sequences(full['Description'].fillna('').astype(str))
lens = pd.Series([len(s) for s in seqs])
print("Seq len -> mean={:.1f}, p50={:.0f}, p90={:.0f}, p99={:.0f}, max={:.0f}".format(
    lens.mean(), lens.quantile(.5), lens.quantile(.9), lens.quantile(.99), lens.max()))
X_text = pad_sequences(seqs, maxlen=max_len)
print("Padded X_text shape:", X_text.shape)


KeyError: 'Description'

In [None]:
# CELL 14: readiness summary checks
checks = {
    "schema_ok": all(c in full.columns for c in REQUIRED_ALL + ['year']),
    "text_non_empty_%": (full['Description'].fillna('').str.strip() != '').mean() * 100,
    "built_year_parse_ok_%": pd.to_datetime(full['BuiltYear'], errors='coerce').notna().mean() * 100,
    "all_num_finite_%": np.isfinite(full[['Area','Price','NumberOfRooms','Floor','Floors','year']].astype(float).values).mean() * 100,
    "cat_cardinality": {c: int(full[c].nunique()) for c in ['BuildingType','OfferFrom','TypeOfMarket']},
    "class_counts": full['State'].value_counts().to_dict(),
    "tabular_shape": tuple(X_tabular.shape),
    "text_shape": tuple(X_text.shape),
}
print(json.dumps(checks, ensure_ascii=False, indent=2))
