In [2]:
import os
import glob
import numpy as np
import h5py
from dataclasses import dataclass
import matplotlib.pyplot as plt
from typing import Dict, List, Optional, Tuple

In [3]:
S3DIS_CLASS_TO_ID: Dict[str, int] = {
    "ceiling": 0,
    "floor": 1,
    "wall": 2,
    "beam": 3,
    "column": 4,
    "window": 5,
    "door": 6,
    "table": 7,
    "chair": 8,
    "sofa": 9,
    "bookcase": 10,
    "board": 11,
    "clutter": 12,
}

S3DIS_ID_TO_CLASS: Dict[int, str] = {v: k for k, v in S3DIS_CLASS_TO_ID.items()}

In [4]:
@dataclass
class RoomBuildResult:
    dataset: np.ndarray            # [N, 7] float32: X Y Z R G B label
    xyz_mean: np.ndarray           # [3]
    xyz_scale: float               # scalar
    label_counts: Dict[int, int]   # {label_id: count}

In [5]:
def _safe_loadtxt(
    path: str,
    delimiter: Optional[str] = None,
    dtype: np.dtype = np.float32,
    max_skiprows: int = 20,
    min_cols: int = 6,
) -> np.ndarray:
    """
    Пытается прочитать numeric-таблицу через np.loadtxt.
    Если в файле есть битые строки/символы, делает fallback на построчное чтение
    и пропускает строки, которые нельзя распарсить в float.
    """
    last_err = None

    # 1) Быстрый путь: np.loadtxt со skiprows
    for skip in range(max_skiprows + 1):
        try:
            arr = np.loadtxt(path, delimiter=delimiter, dtype=dtype, skiprows=skip)
            if arr.ndim == 1:
                arr = arr.reshape(1, -1)
            if arr.size == 0:
                continue
            if arr.shape[1] < min_cols:
                continue
            return arr
        except Exception as e:
            last_err = e
            continue

    # 2) Fallback: построчно, пропуская битые строки
    rows = []
    target_cols = None

    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for _ in range(max_skiprows):
            _ = f.readline()

        for line in f:
            line = line.strip()
            if not line:
                continue

            parts = line.split(delimiter) if delimiter else line.split()
            if len(parts) < min_cols:
                continue

            try:
                vals = [float(x) for x in parts]
            except ValueError:
                continue  # битая строка

            if target_cols is None:
                target_cols = len(vals)
                if target_cols < min_cols:
                    target_cols = min_cols

            if len(vals) < target_cols:
                continue
            rows.append(vals[:target_cols])

    if not rows:
        raise RuntimeError(f"Не удалось прочитать файл: {path}. Последняя ошибка: {last_err}")

    return np.asarray(rows, dtype=dtype)


In [6]:
def _infer_delimiter_from_line(line: str) -> Optional[str]:
    # В S3DIS обычно пробелы, но иногда бывает запятая/таб.
    if "," in line:
        return ","
    return None 

In [7]:
def load_points_file_any(path: str) -> np.ndarray:
    """
    Загружает файл точек (.ptx или .txt), пытаясь получить минимум XYZRGB.
    Возвращает массив [N, >=6] (float32). RGB ожидается в [0..255] (как в S3DIS).
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        first_nonempty = ""
        for _ in range(50):
            s = f.readline()
            if not s:
                break
            s = s.strip()
            if s:
                first_nonempty = s
                break

    delim = _infer_delimiter_from_line(first_nonempty) if first_nonempty else None
    arr = _safe_loadtxt(path, delimiter=delim, dtype=np.float32, max_skiprows=30, min_cols=6)

    # Ожидаемые варианты:
    # - XYZRGB -> 6 колонок
    # - XYZRGB + extra -> >6
    if arr.shape[1] < 6:
        raise ValueError(f"В файле {path} меньше 6 колонок (ожидалось минимум XYZRGB). shape={arr.shape}")

    return arr

In [8]:
def parse_annotation_class_name(filename: str) -> Optional[str]:
    """
    В S3DIS: "chair_1.ptx", "wall_12.ptx", "clutter_3.ptx", ...
    Берём префикс до первого '_' как имя класса.
    """
    base = os.path.basename(filename)
    m = re.match(r"^([A-Za-z]+)\_", base)
    if not m:
        return None
    cls = m.group(1).lower()
    return cls if cls in S3DIS_CLASS_TO_ID else None

In [9]:
def normalize_xyz(xyz: np.ndarray, eps: float = 1e-6) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Нормализация XYZ:
    - центрирование (вычитание среднего)
    - масштабирование на max-норму (максимальная длина вектора)
    Возвращает (xyz_norm, mean, scale)
    """
    mean = xyz.mean(axis=0)
    centered = xyz - mean
    norms = np.linalg.norm(centered, axis=1)
    scale = float(np.max(norms)) if centered.shape[0] > 0 else 1.0
    if scale < eps:
        scale = 1.0
    xyz_norm = centered / scale
    return xyz_norm, mean, scale

In [10]:
def normalize_rgb(rgb: np.ndarray) -> np.ndarray:
    """
    RGB -> [0,1]. Если уже в [0,1], не ломаем.
    """
    rgb = rgb.astype(np.float32)
    if np.max(rgb) <= 1.0 + 1e-6:
        return np.clip(rgb, 0.0, 1.0)
    return np.clip(rgb / 255.0, 0.0, 1.0)

In [11]:
def build_room_dataset(
    room_dir: str,
    use_annotation: bool = True,
    normalize_coords: bool = True,
    normalize_colors: bool = True,
    allow_fallback_room_txt: bool = True,
) -> RoomBuildResult:
    """
    room_dir пример:
      .../Area_1/office_1

    Приоритет:
    1) Annotation/*.ptx: собираем точки по классам -> label
    2) Fallback: room .txt (например office_1.txt), но там обычно нет label.
       Если fallback используется, label ставим -1.
    """
    room_dir = os.path.abspath(room_dir)
    ann_dir = os.path.join(room_dir, "Annotation")
    datasets: List[np.ndarray] = []

    if use_annotation and os.path.isdir(ann_dir):
        ptx_files = sorted(glob.glob(os.path.join(ann_dir, "*.ptx")))
        for p in ptx_files:
            cls = parse_annotation_class_name(p)
            if cls is None:
                continue
            pts = load_points_file_any(p)  # [N, >=6]
            xyz = pts[:, 0:3].astype(np.float32)
            rgb = pts[:, 3:6].astype(np.float32)
            label = np.full((pts.shape[0], 1), S3DIS_CLASS_TO_ID[cls], dtype=np.float32)
            block = np.hstack([xyz, rgb, label]).astype(np.float32)  # [N,7]
            datasets.append(block)

    if len(datasets) == 0 and allow_fallback_room_txt:
        # Пробуем найти room.txt (обычно "<room_name>.txt")
        txt_files = sorted(glob.glob(os.path.join(room_dir, "*.txt")))
        if len(txt_files) == 0:
            raise FileNotFoundError(
                f"Не найдено ни Annotation/*.ptx, ни *.txt в комнате: {room_dir}"
            )
        # Берём самый вероятный (не из Annotation)
        room_txt = None
        for t in txt_files:
            if os.path.basename(os.path.dirname(t)) != "Annotation":
                room_txt = t
                break
        if room_txt is None:
            room_txt = txt_files[0]

        pts = load_points_file_any(room_txt)  # [N, >=6]
        xyz = pts[:, 0:3].astype(np.float32)
        rgb = pts[:, 3:6].astype(np.float32)
        label = np.full((pts.shape[0], 1), -1, dtype=np.float32)  # нет разметки
        datasets.append(np.hstack([xyz, rgb, label]).astype(np.float32))

    dataset = np.vstack(datasets).astype(np.float32)

    # Предобработка
    xyz_mean = np.zeros(3, dtype=np.float32)
    xyz_scale = 1.0

    if normalize_coords:
        xyz_norm, xyz_mean, xyz_scale = normalize_xyz(dataset[:, 0:3])
        dataset[:, 0:3] = xyz_norm

    if normalize_colors:
        dataset[:, 3:6] = normalize_rgb(dataset[:, 3:6])

    # Подсчёт меток
    labels_int = dataset[:, 6].astype(np.int32)
    uniq, cnt = np.unique(labels_int, return_counts=True)
    label_counts = {int(u): int(c) for u, c in zip(uniq, cnt)}

    return RoomBuildResult(
        dataset=dataset,
        xyz_mean=xyz_mean.astype(np.float32),
        xyz_scale=float(xyz_scale),
        label_counts=label_counts,
    )

In [12]:
def save_npy(path: str, dataset: np.ndarray) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    np.save(path, dataset)

In [13]:
def save_txt(path: str, dataset: np.ndarray) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    header = "X Y Z R G B label"
    np.savetxt(path, dataset, fmt="%.6f", header=header, comments="")

In [14]:
def save_h5(path: str, dataset: np.ndarray, compression: str = "gzip") -> None:

    os.makedirs(os.path.dirname(path), exist_ok=True)
    with h5py.File(path, "w") as f:
        f.create_dataset("dataset", data=dataset, compression=compression)
        # Удобно дополнительно хранить X и y
        f.create_dataset("X", data=dataset[:, 0:6], compression=compression)
        f.create_dataset("y", data=dataset[:, 6].astype(np.int32), compression=compression)


In [15]:
def plot_label_histogram(
    dataset: np.ndarray,
    title: str = "Распределение меток",
    save_path: Optional[str] = None,
    show: bool = True,
) -> None:
    import matplotlib.pyplot as plt

    labels = dataset[:, 6].astype(np.int32)
    uniq, cnt = np.unique(labels, return_counts=True)

    plt.figure()
    plt.bar(uniq, cnt)
    plt.xlabel("label id")
    plt.ylabel("count")
    plt.title(title)

    # подписи классов, если метки валидные
    xt = []
    for u in uniq:
        if u in S3DIS_ID_TO_CLASS:
            xt.append(f"{u}\n{S3DIS_ID_TO_CLASS[u]}")
        else:
            xt.append(str(u))
    plt.xticks(uniq, xt, rotation=0)

    if save_path is not None:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        plt.savefig(save_path, dpi=150, bbox_inches="tight")

    if show:
        plt.show()
    else:
        plt.close()


In [16]:
def find_rooms_in_s3dis_root(s3dis_root: str) -> List[str]:
    """
    Ищем комнаты по шаблону:
      s3dis_root/Area_*/<room_name>/
    """
    s3dis_root = os.path.abspath(s3dis_root)
    rooms = []
    for area in sorted(glob.glob(os.path.join(s3dis_root, "Area_*"))):
        if not os.path.isdir(area):
            continue
        for room in sorted(glob.glob(os.path.join(area, "*"))):
            if os.path.isdir(room):
                rooms.append(room)
    return rooms


In [17]:
def build_and_save_all_rooms(
    s3dis_root: str,
    out_root: str,
    use_annotation: bool = True,
    normalize_coords: bool = True,
    normalize_colors: bool = True,
    save_formats: Tuple[str, ...] = ("npy", "txt", "h5"),
    save_hist_png: bool = True,
    show_hist: bool = False,
) -> None:
    rooms = find_rooms_in_s3dis_root(s3dis_root)
    if len(rooms) == 0:
        raise FileNotFoundError(f"Не найдено комнат в корне S3DIS: {s3dis_root}")

    out_root = os.path.abspath(out_root)
    os.makedirs(out_root, exist_ok=True)

    for room_dir in rooms:
        area = os.path.basename(os.path.dirname(room_dir))
        room = os.path.basename(room_dir)
        key = f"{area}__{room}"

        res = build_room_dataset(
            room_dir=room_dir,
            use_annotation=use_annotation,
            normalize_coords=normalize_coords,
            normalize_colors=normalize_colors,
            allow_fallback_room_txt=True,
        )

        base = os.path.join(out_root, key)
        if "npy" in save_formats:
            save_npy(base + ".npy", res.dataset)
        if "txt" in save_formats:
            save_txt(base + ".txt", res.dataset)
        if "h5" in save_formats:
            save_h5(base + ".h5", res.dataset)

        if save_hist_png:
            plot_label_histogram(
                res.dataset,
                title=f"Label distribution: {key}",
                save_path=base + "_labels.png",
                show=show_hist,
            )

        # Мини-лог в консоль + первые 5 строк
        print(f"[OK] {key}: dataset shape = {res.dataset.shape}, labels = {res.label_counts}")
        print(res.dataset[:5])

In [18]:
s3dis_root = "./Stanford3dDataset_v1.2_Aligned_Version"
out_dir = "./task2_output"
os.makedirs(out_dir, exist_ok=True)

build_and_save_all_rooms(
    s3dis_root=s3dis_root,
    out_root=out_dir,
    use_annotation=True,
    normalize_coords=True,
    normalize_colors=True,
    save_formats=,
    save_hist_png=True,
    show_hist=False,
)

[OK] Area_1__WC_1: dataset shape = (1112902, 7), labels = {-1: 1112902}
[[ 0.6405453  -0.15620343  0.32196108  0.9607843   0.9607843   0.99215686
  -1.        ]
 [ 0.61642057 -0.17035371  0.32196108  0.9529412   0.9647059   0.9843137
  -1.        ]
 [ 0.6370659  -0.16687432  0.32172912  0.9647059   0.9764706   1.
  -1.        ]
 [ 0.6359058  -0.17592055  0.32126516  0.9647059   0.96862745  0.9843137
  -1.        ]
 [ 0.61966807 -0.17035371  0.32196108  0.94509804  0.9647059   0.98039216
  -1.        ]]
[OK] Area_1__conferenceRoom_1: dataset shape = (1136617, 7), labels = {-1: 1136617}
[[ 0.5748412   0.17337881  0.15701628  0.2784314   0.2509804   0.21176471
  -1.        ]
 [ 0.56844836  0.17670329  0.15292476  0.26666668  0.2509804   0.20392157
  -1.        ]
 [ 0.571517    0.17568     0.15215759  0.27450982  0.23921569  0.20392157
  -1.        ]
 [ 0.5717726   0.1746567   0.15727197  0.28235295  0.25490198  0.21568628
  -1.        ]
 [ 0.57560843  0.17337881  0.1562491   0.2784314   0