In [None]:
from utils.loader_accessibility import download_dataset

# Short alias -> numeric dataset id handed to download_dataset.
# The alias also becomes the CSV file name under data/raw/.
dataset_ids = {
    # sport_* aliases
    "sport_629": 629,
    "sport_893": 893,
    # med_* aliases
    "med_1258": 1258,
    "med_517": 517,
    "med_502": 502,
    "med_516": 516,
    "med_1260": 1260,
    # pharmacy
    "pharmacy": 2357,
    # transport_* aliases
    "transport_stops": 3221,
    "transport_routes": 62888,
    "transport_schedules": 758,
}

# Fetch every dataset in declaration order, announcing each one first.
for name in dataset_ids:
    ds_id = dataset_ids[name]
    print(f"\n--- Загружаем {name} ---")
    save_path = f"data/raw/{name}.csv"
    df = download_dataset(ds_id, save_path=save_path)


In [None]:
import pandas as pd

# Inspect how pandas loaded the first geoData cell: its Python type, then its value.
df = pd.read_csv("data/raw/sport_629.csv")
first_geo = df.loc[0, "geoData"]
print(type(first_geo), first_geo, sep="\n")


In [None]:
import pandas as pd

df = pd.read_csv("data/raw/sport_629.csv")
print(df.columns)
# Transposed for readability: each column of the three-row sample gets its own line.
print(df.head(n=3).transpose())


In [1]:
from utils.preprocess import clean_and_convert

# Dataset aliases to convert; each maps to data/raw/<alias>.csv.
dataset_files = ["transport_stops"]

for name in dataset_files:
    # Input CSV and output GeoJSON share the alias as their file stem.
    raw_path, out_path = f"data/raw/{name}.csv", f"data/processed/{name}.geojson"
    clean_and_convert(raw_path, out_path)

[INFO] Обработка: data/raw/transport_stops.csv
[DEBUG] Неизвестный формат координат: None
[DEBUG] Неизвестный формат координат: None
[INFO] Сохранено: data/processed/transport_stops.geojson (823 объектов)


In [None]:
import pandas as pd

df = pd.read_csv("data/raw/transport_stops.csv")
print(df.columns)
# Plain-text rendering of the first three rows.
print(df.head(n=3).to_string())


In [4]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os
import ast


def extract_coordinates(row):
    """Build a shapely ``Point`` from the ``geoData`` entry of a row.

    Parameters
    ----------
    row : mapping-like
        Any object exposing ``.get`` (a pandas row ``Series`` or a plain
        dict). Its ``"geoData"`` value is expected to be an already-parsed
        dict.

    Returns
    -------
    shapely.geometry.Point or None
        ``Point(first, second)`` built from the two items of
        ``geoData["coordinates"]``. ``None`` when ``geoData`` is not a dict,
        or when ``coordinates`` is not a flat pair of real numbers.
    """
    geo = row.get("geoData")
    if not isinstance(geo, dict):
        return None

    coords = geo.get("coordinates")
    if not isinstance(coords, (list, tuple)) or len(coords) != 2:
        return None

    def _is_number(value):
        # bool is a subclass of int but is never a valid coordinate.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # A two-element sequence is not necessarily a point: a two-vertex line is
    # [[x1, y1], [x2, y2]] and a record may carry [None, None]. Only a flat
    # pair of numbers may reach Point().
    if not all(_is_number(item) for item in coords):
        return None

    return Point(coords[0], coords[1])


def clean_and_convert(input_path, output_path):
    """Convert a raw CSV with a ``geoData`` column into a GeoJSON file.

    The ``geoData`` strings are parsed with ``ast.literal_eval``, a point
    geometry is derived for every row via ``extract_coordinates``, rows
    without a geometry are dropped, and the remainder is written to
    ``output_path`` as GeoJSON in EPSG:4326.

    Parameters
    ----------
    input_path : str
        Path of the CSV file to read.
    output_path : str
        Path of the GeoJSON file to write. Its parent directory is created
        when missing; a bare file name (no directory part) is also accepted.

    Returns
    -------
    geopandas.GeoDataFrame or None
        The saved frame, or ``None`` when the CSV has no ``geoData`` column
        or no row yields a geometry (nothing is written in those cases).
    """
    print(f"[INFO] Обработка: {input_path}")

    def safe_parse_geo(val):
        # Deliberately best-effort: any value that cannot be parsed (NaN,
        # malformed text, ...) becomes an empty dict and is filtered out
        # later because it yields no geometry.
        try:
            return ast.literal_eval(val)
        except Exception:
            return {}

    df = pd.read_csv(input_path)
    if "geoData" not in df.columns:
        print(f"[WARN] Нет поля geoData в {input_path}")
        return None

    df["geoData"] = df["geoData"].apply(safe_parse_geo)
    df["geometry"] = df.apply(extract_coordinates, axis=1)
    df = df[df["geometry"].notnull()]

    if df.empty:
        print(f"[WARN] Нет валидных координат в {input_path}")
        return None

    gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    gdf.to_file(output_path, driver="GeoJSON")
    print(f"[INFO] Сохранено: {output_path} ({len(gdf)} объектов)")
    return gdf


# --- Usage: convert the transport routes dataset when run as the main module
if __name__ == "__main__":
    clean_and_convert(
        "data/raw/transport_routes.csv",
        "data/processed/transport_routes.geojson",
    )


[INFO] Обработка: data/raw/transport_routes.csv
[INFO] Сохранено: data/processed/transport_routes.geojson (139 объектов)


In [3]:
import pandas as pd

df = pd.read_csv("data/raw/transport_routes.csv")
print(df.columns)
# Trailing expression: the notebook displays the first five rows as the cell result.
df.head(n=5)


Index(['StationName', 'TransliterationStation', 'DiameterName', 'City',
       'District', 'Area', 'Tariff', 'Platforms', 'MaskStation', 'TransferMCD',
       'TransferAeroExpress', 'TransferMetroStation', 'AeroexpressStation',
       'RailwayStation', 'ExitTrainStations', 'WorkingHours', 'ObjectStatus',
       'global_id', 'geoData'],
      dtype='object')


Unnamed: 0,StationName,TransliterationStation,DiameterName,City,District,Area,Tariff,Platforms,MaskStation,TransferMCD,TransferAeroExpress,TransferMetroStation,AeroexpressStation,RailwayStation,ExitTrainStations,WorkingHours,ObjectStatus,global_id,geoData
0,Одинцово,Odintsovo,МЦД-1,Одинцовский городской округ,,,Пригород,3,да,,Шереметьево,[],"{'global_id': 1509001630, 'value': 'Одинцово'}",[],,"[{'is_deleted': 0, 'global_id': 64, 'DayOfWeek...",действует,1058815439,"{'coordinates': [37.281532, 55.672189], 'type'..."
1,Баковка,Bakovka,МЦД-1,Одинцовский городской округ,,,Пригород,2,нет,,Шереметьево,[],"{'global_id': 1509001278, 'value': 'Баковка'}",[],,"[{'is_deleted': 0, 'global_id': 106, 'DayOfWee...",действует,1058819388,"{'coordinates': [37.317476, 55.683022], 'type'..."
2,Сколково,Skolkovo,МЦД-1,Одинцовский городской округ,,,Пригород,3,нет,,Шереметьево,[],"{'global_id': 1509001201, 'value': 'Сколково'}",[],,"[{'is_deleted': 0, 'global_id': 120, 'DayOfWee...",действует,1058901090,"{'coordinates': [37.342054, 55.70023], 'type':..."
3,Немчиновка,Nemchinovka,МЦД-1,Одинцовский городской округ,,,Пригород,2,нет,,Шереметьево,[],"{'global_id': 1508999617, 'value': 'Немчиновка'}",[],,"[{'is_deleted': 0, 'global_id': 127, 'DayOfWee...",действует,1058901245,"{'coordinates': [37.375731, 55.716016], 'type'..."
4,Сетунь,Setun,МЦД-1,город Москва,Западный административный округ,Можайский район,Центральная,2,нет,,Шереметьево,[],"{'global_id': 1508997643, 'value': 'Сетунь'}","{'global_id': 1132155504, 'value': 'Сетунь'}",,"[{'is_deleted': 0, 'global_id': 141, 'DayOfWee...",действует,1058901441,"{'coordinates': [37.397316, 55.723642], 'type'..."


In [None]:
# First geoData cell of the frame loaded earlier: its value, then its Python type.
first_geo = df.loc[0, "geoData"]
print(first_geo, type(first_geo), sep="\n")

In [None]:
from utils.preprocess import clean_transport_stops

# Arguments are passed positionally: the raw CSV path first, the GeoJSON path second.
clean_transport_stops(
    "data/raw/transport_stops.csv",
    "data/processed/transport_stops.geojson",
)


In [None]:
import pandas as pd
import ast

df = pd.read_csv("data/raw/transport_stops.csv")
print(df.columns)

# Show the first non-empty geoData value, both raw and after parsing.
for i, val in enumerate(df["geoData"]):
    # Skip non-string cells (e.g. NaN) and strings of 10 characters or fewer.
    if not (isinstance(val, str) and len(val) > 10):
        continue
    try:
        parsed = ast.literal_eval(val)
    except Exception as e:
        # Not a Python literal: fall back to showing the raw text.
        parsed = val
    print(f"\nСтрока {i}:\nИсходное значение:\n{val}\n\nПосле ast.literal_eval:\n{parsed}\nТип: {type(parsed)}")
    break


In [None]:
import ast

import pandas as pd

# Parse the geoData column while reading, so each cell arrives as a Python
# object instead of a string. ast.literal_eval is used rather than eval:
# the CSV is downloaded data, and eval would execute any expression found in
# it, whereas literal_eval only accepts Python literals (dicts, lists,
# numbers, strings, ...).
df = pd.read_csv("data/raw/sport_629.csv", converters={"geoData": ast.literal_eval})
print(df.loc[0, "geoData"])
print(type(df.loc[0, "geoData"]))


In [None]:
# Relies on `df` already holding a frame whose geoData cells are parsed
# objects (dicts), not raw strings.
print("[INFO] Пример geoData:")
print(df["geoData"].dropna().head(3).to_list())

print("[INFO] geoData с координатами:")
# The flag must be a real bool. Without bool() the `and` chain returns the
# coordinates list itself, so the Series holds lists and .sum() would
# concatenate them (or raise TypeError when mixed with False) instead of
# counting rows.
has_coords = df["geoData"].apply(
    lambda g: bool(isinstance(g, dict) and "coordinates" in g and g["coordinates"])
)
print(f"{has_coords.sum()} из {len(df)} записей имеют координаты")
