In [None]:
import time

from sprint2 import transport_type

# Folder with the input files; paths are built by plain string concatenation,
# so the trailing slash is required.
data_folder = 'data1/'
csv_path = data_folder + "PASS_ALL_202503242210.csv"  # path to the large source file

# Same file name with a .parquet extension; this is the path handed to
# read_parquet_with_duckdb() further down.
parquet_path = csv_path.replace('.csv', '.parquet')
# Print the wall-clock time at which this cell was executed.
print(f"Последний запуск {time.ctime()}")


In [None]:
import sys
# Install/upgrade the third-party packages with the same interpreter that runs this kernel.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!{sys.executable} -m pip install pyarrow duckdb polars plotly jupyter_dash dash --upgrade
# Dependency check: import each package and print its version.
print("Проверка зависимостей")

import duckdb
import polars as pl
import plotly
import jupyter_dash

# Reaching these lines means every import above succeeded.
print("DuckDB OK:", duckdb.__version__)
print("Polars OK:", pl.__version__)
print("Plotly OK:", plotly.__version__)
print("Jupiter_dash OK:", jupyter_dash.__version__)


In [None]:
import duckdb
import polars as pl


def read_parquet_with_duckdb(data_path: str, no_metro=False, no_overground=False, sample_every=10):
    """Load a systematic sample of the trips file via DuckDB and add helper columns.

    Args:
        data_path: Path of the file to query (the notebook passes a parquet path).
        no_metro: When true, keep only rows with ``TRANSPORT_TYPE_ID > 1``.
        no_overground: When true, keep only rows with ``TRANSPORT_TYPE_ID = 1``.
            Setting both flags combines the two conditions with ``AND`` and
            therefore yields an empty frame.
        sample_every: Keep every N-th row of the (filtered) input; the default
            of 10 keeps one row in ten, ``1`` keeps everything.

    Returns:
        A polars DataFrame with the selected columns plus ``row_num``,
        ``TRAN_ONLY_DATE``, ``TRAN_ONLY_TIME`` and ``DAY_NAME``; ``TRAN_DATE``
        is parsed into a datetime.

    Raises:
        ValueError: If ``sample_every`` is smaller than 1.
    """
    step = int(sample_every)
    if step < 1:
        raise ValueError("sample_every must be >= 1")

    # Collect the filters first so that several of them are joined with AND
    # instead of producing two WHERE keywords (invalid SQL).
    conditions = []
    if no_overground:
        conditions.append("TRANSPORT_TYPE_ID = 1")
    if no_metro:
        conditions.append("TRANSPORT_TYPE_ID > 1")
    where_clause = ("WHERE " + " AND ".join(conditions)) if conditions else ""

    # The path is embedded as a SQL string literal: double any single quote in it.
    quoted_path = data_path.replace("'", "''")

    # NOTE(review): ROW_NUMBER() has no ORDER BY, so which rows land on a
    # multiple of `step` depends on DuckDB's scan order — confirm this is acceptable.
    query = f"""
        SELECT *
        FROM (
            SELECT
                TRAN_DATE,
                DEVICE_NO,
                TRANSPORT_TYPE_ID,
                PLACE_ID,
                BUS_RT_NO,
                ROW_NUMBER() OVER () AS row_num
            FROM '{quoted_path}'
            {where_clause}
        ) AS sub
        WHERE row_num % {step} = 0
    """

    result = duckdb.query(query).pl().with_columns(
        # TRAN_DATE arrives as text and is parsed with fractional seconds.
        pl.col("TRAN_DATE").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S%.f")
    ).with_columns([
        pl.col("TRAN_DATE").dt.date().alias("TRAN_ONLY_DATE"),
        pl.col("TRAN_DATE").dt.time().alias("TRAN_ONLY_TIME")
    ]).with_columns(
        # Rows of transport type 1 without a route number get the sentinel -239.
        pl.when(
            (pl.col("TRANSPORT_TYPE_ID") == 1) & (pl.col("BUS_RT_NO").is_null())
        )
        .then(-239)
        .otherwise(pl.col("BUS_RT_NO"))
        .alias("BUS_RT_NO")
    ).with_columns(
        pl.col("TRAN_DATE").dt.strftime("%A").alias("DAY_NAME")
    )
    return result


# Load the sampled trips with no transport-type filter; this frame is later
# passed to application() to bound the date picker. Then preview a random sample.
big_data = read_parquet_with_duckdb(parquet_path)
big_data.sample()

In [None]:
def show(fig):
    """Add Play/Pause animation buttons and a slider to ``fig`` and display it.

    Args:
        fig: Figure-like object exposing ``update_layout(**kwargs)`` and ``show()``.

    Returns:
        None. The figure is modified in place and then shown.
    """
    # Play: 2000 ms per frame with a 500 ms linear transition, resuming from the current frame.
    play_options = {
        "frame": {"duration": 2000, "redraw": True},
        "fromcurrent": True,
        "transition": {"duration": 500, "easing": "linear"},
    }
    # Pause: animate to "no frame" immediately, which stops the running animation.
    pause_options = {
        "frame": {"duration": 0, "redraw": False},
        "mode": "immediate",
        "transition": {"duration": 0},
    }

    play_button = dict(label="Play", method="animate", args=[None, play_options])
    pause_button = dict(label="Pause", method="animate", args=[[None], pause_options])
    animation_menu = dict(type="buttons", showactive=False, buttons=[play_button, pause_button])

    # The slider label is prefixed with "День: " ("Day: ").
    frame_slider = dict(
        transition={"duration": 500},
        currentvalue={"prefix": "День: "},
        pad={"t": 30},
        len=0.9,
    )

    fig.update_layout(
        coloraxis_showscale=False,
        updatemenus=[animation_menu],
        sliders=[frame_slider],
    )
    fig.show()

In [None]:
def filter_by_time(data, start_dt, end_dt, start_tm, end_tm, continuous):
    """Keep only the rows of ``data`` that fall inside the requested time window.

    Args:
        data: Frame with ``TRAN_DATE``, ``TRAN_ONLY_DATE`` and ``TRAN_ONLY_TIME`` columns.
        start_dt: First day, formatted ``YYYY-MM-DD``.
        end_dt: Last day, formatted ``YYYY-MM-DD``.
        start_tm: Start time, formatted ``HH:MM``.
        end_tm: End time, formatted ``HH:MM``.
        continuous: If true, a single interval from ``start_dt start_tm`` to
            ``end_dt end_tm`` is used. Otherwise the date range and the
            time-of-day range are applied independently, i.e. the same daily
            time slot is taken on every day of the range.

    Returns:
        The filtered frame. All bounds are inclusive.

    Raises:
        ValueError: If a date or time string does not match its format.
        TypeError: If a date or time argument is not a string.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having executed its `from datetime import datetime` first.
    from datetime import datetime

    if continuous:
        window_start = datetime.strptime(start_dt + " " + start_tm, "%Y-%m-%d %H:%M")
        window_end = datetime.strptime(end_dt + " " + end_tm, "%Y-%m-%d %H:%M")
        return data.filter(
            (pl.col("TRAN_DATE") >= window_start) &
            (pl.col("TRAN_DATE") <= window_end)
        )

    first_day = datetime.strptime(start_dt, "%Y-%m-%d").date()
    last_day = datetime.strptime(end_dt, "%Y-%m-%d").date()
    # Times are parsed with minute precision, so an end of "23:59" means
    # 23:59:00 and rows from the last 59 seconds of the day are not matched.
    slot_start = datetime.strptime(start_tm, "%H:%M").time()
    slot_end = datetime.strptime(end_tm, "%H:%M").time()
    return data.filter(
        (pl.col("TRAN_ONLY_DATE") >= first_day) &
        (pl.col("TRAN_ONLY_DATE") <= last_day) &
        (pl.col("TRAN_ONLY_TIME") >= slot_start) &
        (pl.col("TRAN_ONLY_TIME") <= slot_end)
    )

In [None]:
# Module-level subsets used by the load_top_* functions below. Each call reads the file again:
# no_metro=True keeps rows with TRANSPORT_TYPE_ID > 1, no_overground=True keeps TRANSPORT_TYPE_ID = 1.
overground = read_parquet_with_duckdb(parquet_path, no_metro=True)
underground = read_parquet_with_duckdb(parquet_path, no_overground=True)


In [103]:
from datetime import datetime


def load_top_stops_overground(top=20, start_dt="2025-03-10", end_dt="2025-03-16", start_tm="00:00",
                              end_tm="23:59", continuous=False):
    """Rank places in the module-level ``overground`` frame by number of rows in a time window.

    Args:
        top: Highest dense rank to keep. Ties share a rank, so more than
            ``top`` rows can be returned.
        start_dt, end_dt: Date bounds, formatted ``YYYY-MM-DD``.
        start_tm, end_tm: Time bounds, formatted ``HH:MM``.
        continuous: Passed through to ``filter_by_time``.

    Returns:
        A polars DataFrame with columns ``NAME``, ``count`` and ``rank``, or
        the string ``"Wrong data format"`` when a date/time argument cannot be parsed.
    """
    place_lookup = pl.read_csv(data_folder + "REF_PSG_PLACES_202503251822.csv", separator=';')

    # Only parsing problems are reported as a message; any other error
    # (missing column, undefined name, interrupt) is a real failure and propagates.
    try:
        filtered = filter_by_time(overground, start_dt, end_dt, start_tm, end_tm, continuous)
    except (ValueError, TypeError):
        return "Wrong data format"

    top_stops = (
        filtered.join(
            place_lookup.select(["PLACE_ID", "NAME", "LN_NAME_SHORT"]),
            left_on="PLACE_ID",
            right_on="PLACE_ID",
            how="left"
        )
        .group_by("NAME")
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        # Places missing from the lookup have a null NAME after the left join; drop them.
        .drop_nulls()
        .with_columns(
            pl.col("count").rank("dense", descending=True).alias("rank")
        )
        .filter(pl.col("rank") <= top))

    return top_stops


# Preview with the defaults: top=20 for 2025-03-10..2025-03-16, 00:00-23:59, continuous=False.
load_top_stops_overground()


NAME,count,rank
str,u32,u32
"""Южный филиал (1 площадка-Домод…",437886,1
"""Юго-Западный филиал (1 площадк…",325174,2
"""Северо-западный филиал (1 площ…",315492,3
"""Центральный филиал (1 площадка…",169969,4
"""Северо-Восточный филиал (1 пло…",165215,5
…,…,…
"""Стартранс (Троицк) 6102""",62896,16
"""Центральный филиал (4 площадка…",61139,17
"""Трансавтолиз (Батюнинский)""",60729,18
"""Северо-Восточный филиал (4 пло…",59263,19


In [104]:
def load_top_stops_underground(top=20, start_dt="2025-03-10", end_dt="2025-03-16", start_tm="00:00",
                               end_tm="23:59", continuous=False):
    """Rank places in the module-level ``underground`` frame by number of rows in a time window.

    Args:
        top: Highest dense rank to keep. Ties share a rank, so more than
            ``top`` rows can be returned.
        start_dt, end_dt: Date bounds, formatted ``YYYY-MM-DD``.
        start_tm, end_tm: Time bounds, formatted ``HH:MM``.
        continuous: Passed through to ``filter_by_time``.

    Returns:
        A polars DataFrame with columns ``PLACE_ID``, ``NAME``,
        ``LN_NAME_SHORT``, ``count`` and ``rank``; or an error string when the
        places lookup file is missing or a date/time argument cannot be parsed.
    """
    try:
        place_lookup = pl.read_csv(data_folder + "REF_PSG_PLACES_202503251822.csv", separator=';')
    except FileNotFoundError:
        return "No file to look up places: REF_PSG_PLACES_202503251822.csv"

    # Only parsing problems are reported as a message; any other error
    # (missing column, undefined name, interrupt) is a real failure and propagates.
    try:
        filtered = filter_by_time(underground, start_dt, end_dt, start_tm, end_tm, continuous)
    except (ValueError, TypeError):
        return "Wrong data format"

    top_stops = (
        filtered.join(
            place_lookup.select(["PLACE_ID", "NAME", "LN_NAME_SHORT"]),
            left_on="PLACE_ID",
            right_on="PLACE_ID",
            how="left"
        )
        .group_by("PLACE_ID", "NAME", "LN_NAME_SHORT")
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        # Rows with a null in any column (e.g. places missing from the lookup) are dropped.
        .drop_nulls()
        .with_columns(
            pl.col("count").rank("dense", descending=True).alias("rank")
        )
        .filter(pl.col("rank") <= top))

    return top_stops


# Preview with the defaults: top=20 for 2025-03-10..2025-03-16, 00:00-23:59, continuous=False.
load_top_stops_underground()

PLACE_ID,NAME,LN_NAME_SHORT,count,rank
i64,str,str,u32,u32
1327,"""Щёлковская ( Северный )""","""Арбатско-Покровская""",39925,1
1294,"""Бауманская""","""Арбатско-Покровская""",39082,2
1459,"""Павелецкая КЛ""","""Кольцевая""",37759,3
1691,"""Текстильщики (Западный)""","""Таганско-Краснопресненская""",36699,4
1322,"""Славянский бульвар (Восток)""","""Арбатско-Покровская""",36539,5
…,…,…,…,…
1453,"""Комсомольская К (подземный зал…","""Кольцевая""",25679,16
1561,"""Бульвар Дмитрия Донского (Южны…","""Серпуховско-Тимирязевская""",25370,17
1299,"""Киевская АПЛ""","""Арбатско-Покровская""",24934,18
38954,"""Некрасовка (Первый)""","""Некрасовская""",24920,19


In [105]:
def load_top_routes_overground(top=20, start_dt="2025-03-10", end_dt="2025-03-16", start_tm="00:00",
                               end_tm="23:59", continuous=False, popularity=False):
    """Rank routes in the module-level ``overground`` frame for a time window.

    Two dense ranks are computed per route: ``popularity`` (by number of rows)
    and ``overload`` (by rows per distinct ``DEVICE_NO``).

    Args:
        top: Highest rank to keep; ties share a rank, so more rows may be returned.
        start_dt, end_dt: Date bounds, formatted ``YYYY-MM-DD``.
        start_tm, end_tm: Time bounds, formatted ``HH:MM``.
        continuous: Passed through to ``filter_by_time``.
        popularity: If truthy, filter on the ``popularity`` rank, otherwise on ``overload``.

    Returns:
        A polars DataFrame with one row per route, including ``count``,
        ``vehicle_count``, ``rides_per_vehicle``, both ranks and the columns
        joined from the transport-type lookup.
    """
    route_lookup = pl.read_csv(data_folder + "REF_TRANSPORT_WAY_202503251803.csv", separator=';')
    transport_type_lookup = pl.read_csv(data_folder+"REF_TRANSPORT_TYPE_202503251727.csv", separator=';')

    trips = filter_by_time(overground, start_dt, end_dt, start_tm, end_tm, continuous)

    # Attach route name and transport id, then discard trips without a route number.
    route_columns = route_lookup.select(["WAY_ID", "NAME", "TRANSPORT_ID"])
    trips_with_route = trips.join(
        route_columns, left_on="BUS_RT_NO", right_on="WAY_ID", how="left"
    ).filter(pl.col("BUS_RT_NO").is_not_null())

    # One row per route: number of trips and number of distinct devices seen on it.
    per_route = (
        trips_with_route
        .group_by("BUS_RT_NO", "NAME", "TRANSPORT_ID")
        .agg(
            pl.len().alias("count"),
            pl.col("DEVICE_NO").n_unique().alias("vehicle_count"),
        )
        .drop_nulls()
    )

    with_load = per_route.with_columns(
        (pl.col("count") / pl.col("vehicle_count")).alias("rides_per_vehicle")
    )
    ranked = with_load.with_columns(
        pl.col("count").rank("dense", descending=True).alias("popularity"),
        pl.col("rides_per_vehicle").rank("dense", descending=True).alias("overload"),
    )
    labelled = ranked.join(
        transport_type_lookup, left_on="TRANSPORT_ID", right_on="TRANSPORT_ID", how="left"
    )

    rank_column = "popularity" if popularity else "overload"
    return labelled.filter(pl.col(rank_column) <= top)


# Preview with the defaults; popularity=False, so rows are selected by the "overload" rank.
load_top_routes_overground()



BUS_RT_NO,NAME,TRANSPORT_ID,count,vehicle_count,rides_per_vehicle,popularity,overload,TYPE_NAME
i64,str,i64,u32,u32,f64,u32,u32,str
628,"""631 автобус""",2,3543,42,84.357143,147,17,"""Автобус"""
533,"""035 автобус""",2,472,5,94.4,408,8,"""Автобус"""
605,"""608 автобус""",2,15625,189,82.671958,14,19,"""Автобус"""
223,"""225 автобус""",2,8749,101,86.623762,45,14,"""Автобус"""
1261,"""882Н автобус пригород МО""",2,4923,57,86.368421,110,15,"""Автобус"""
…,…,…,…,…,…,…,…,…
844,"""848 автобус""",2,8526,102,83.588235,49,18,"""Автобус"""
707,"""710 автобус""",2,4468,51,87.607843,121,13,"""Автобус"""
197,"""199 автобус""",2,798,9,88.666667,366,12,"""Автобус"""
1944,"""МАН01 автобус""",2,11424,108,105.777778,27,3,"""Автобус"""


In [97]:
def plot(data: pl.DataFrame, x: str, y: str, title="", color=None, do_show=False,
         x_title="Остановка", y_title="Количество поездок"):
    """Draw a bar chart of ``y`` against ``x`` with bars ordered by descending total.

    Args:
        data: Frame to plot; it is converted with ``to_pandas()`` before plotting.
        x: Column used for the categories on the x axis.
        y: Column used for the bar heights.
        title: Figure title.
        color: Optional column used to colour the bars; ``None`` means no colouring.
        do_show: If true, the figure is also passed to ``show()``.
        x_title: Label of the x axis (defaults to the previously hard-coded text).
        y_title: Label of the y axis (defaults to the previously hard-coded text).

    Returns:
        The created figure.
    """
    df_pandas = data.to_pandas()

    # `color=None` is px.bar's own default, so one call covers both cases.
    fig = px.bar(
        df_pandas,
        x=x,
        y=y,
        title=title,
        color=color,
    )

    fig.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        xaxis_categoryorder="total descending",
        xaxis_tickangle=-45,
        height=500,
        width=1000,
        margin=dict(t=50, b=100),
        coloraxis_showscale=False
    )
    if do_show:
        show(fig)

    return fig

In [108]:
from dash import Dash, dcc, html, Input, Output, State
import plotly.express as px
import pandas as pd
from jupyter_dash import JupyterDash

# NOTE(review): `save` is not read or modified anywhere else in the visible notebook —
# confirm whether it is still needed.
save = []


def application(df):
    """Build the interactive "top routes / stops" dashboard and start its server.

    The page offers a date range, a time range, the size of the top list, the
    kind of time window (continuous or the same daily slot) and the chart type.
    Pressing the button rebuilds the bar chart from the module-level
    ``load_top_*`` functions.

    Args:
        df: Frame with a ``TRAN_DATE`` datetime column; only its minimum and
            maximum are used, to bound and pre-fill the date picker.

    Returns:
        None. The app is served on port 8239 in "external" Jupyter mode.
    """
    # JupyterDash emits a deprecation warning that points to plain Dash, which
    # has Jupyter support built in (https://dash.plotly.com/dash-in-jupyter).
    app = Dash(__name__)

    # Bounds of the loaded data, computed once for both the limits and the defaults.
    first_day = df['TRAN_DATE'].min().date()
    last_day = df['TRAN_DATE'].max().date()

    app.layout = html.Div([
        html.Div([
            html.Label("Дата:"),
            dcc.DatePickerRange(
                id='date-picker',
                min_date_allowed=first_day,
                max_date_allowed=last_day,
                start_date=first_day,
                end_date=last_day
            ),
        ]),
        html.Div([
            html.Label("Размер вывода"),
            dcc.Input(id='top-number', type='text', value='20'),
        ], style={'marginTop': 10, 'marginBottom': 10}),
        html.Div([
            html.Label("Время начала (HH:MM):"),
            dcc.Input(id='start-time', type='text', value='00:00'),
            html.Label("Время конца (HH:MM):"),
            dcc.Input(id='end-time', type='text', value='23:59'),
        ], style={'marginTop': 10, 'marginBottom': 10}),

        html.Div([
            html.Label("Тип временного отрезка"),
            dcc.RadioItems(
                id='time-type',
                options=[
                    {'label': 'Непрерывно', 'value': 'h0'},
                    {'label': 'Кусочно', 'value': 'h1'},
                ],
                value='h0',
                labelStyle={'display': 'inline-block', 'marginRight': '15px'}
            ),
        ], style={'marginBottom': 10}),

        html.Div([
            html.Label("Тип графика:"),
            dcc.RadioItems(
                id='data-source',
                options=[
                    {'label': 'Маршруты популярность', 'value': 't0'},
                    {'label': 'Маршруты перегруженность', 'value': 't1'},
                    {'label': 'Остановки', 'value': 't2'},
                    {'label': 'Метро популярность', 'value': 't3'},
                ],
                value='t0',
                labelStyle={'display': 'inline-block', 'marginRight': '15px'}
            ),
        ], style={'marginBottom': 10}),

        html.Button("Построить график", id='submit-button', n_clicks=0),
        dcc.Graph(id='graph')
    ])

    @app.callback(
        Output('graph', 'figure'),
        Input('submit-button', 'n_clicks'),
        State('date-picker', 'start_date'),
        State('date-picker', 'end_date'),
        State('start-time', 'value'),
        State('end-time', 'value'),
        State('data-source', 'value'),
        State('top-number', 'value'),
        State('time-type', 'value'),
    )
    def update_graph(n_clicks, start_date, end_date, start_time_str, end_time_str, data_source, top_number, time_type):
        """Rebuild the chart from the current form values when the button is pressed."""
        if not n_clicks:
            return px.scatter(title="Выберите параметры и нажмите кнопку")
        print(f"n_clicks: {n_clicks}, start_date: {start_date}, end_date: {end_date}, top_number: {top_number}")

        # 'h1' selects the same daily time slot; anything else is one continuous interval.
        continuous = time_type != 'h1'

        # Early validation of the form values; the parsed timestamps themselves are not needed.
        try:
            top_number = int(top_number)
            pd.to_datetime(f"{start_date} {start_time_str}")
            pd.to_datetime(f"{end_date} {end_time_str}")
        except Exception as e:
            return px.scatter(title=f"Ошибка в формате даты/времени: {e}")

        window = (top_number, start_date, end_date, start_time_str, end_time_str, continuous)
        try:
            if data_source == 't0':
                data = load_top_routes_overground(*window, True)
                y_column, color_column = "count", "TYPE_NAME"
            elif data_source == 't1':
                data = load_top_routes_overground(*window, False)
                y_column, color_column = "rides_per_vehicle", "TYPE_NAME"
            elif data_source == 't2':
                data = load_top_stops_overground(*window)
                y_column, color_column = "count", None
            elif data_source == 't3':
                data = load_top_stops_underground(*window)
                y_column, color_column = "count", "LN_NAME_SHORT"
            else:
                return px.scatter(title=f"Неизвестный тип графика: {data_source}")
        except ValueError as e:
            # The loaders parse times strictly as HH:MM, which is stricter than
            # the pandas check above (e.g. "12:30:45" passes there but not here).
            return px.scatter(title=f"Ошибка в формате даты/времени: {e}")

        # The stop loaders report problems by returning a message instead of a frame.
        if isinstance(data, str):
            return px.scatter(title=data)

        return plot(data, "NAME", y_column, color=color_column)

    app.run(jupyter_mode='external', port=8239)


# Start the dashboard; big_data is only used to bound and pre-fill the date picker.
application(big_data)


JupyterDash is deprecated, use Dash instead.
See https://dash.plotly.com/dash-in-jupyter for more details.



In [91]:
# NOTE(review): repeat of the earlier load_top_stops_overground() preview. The output saved under
# this cell has PLACE_ID and LN_NAME_SHORT columns, while the function defined above groups by
# NAME only, so this output appears to come from an older version — re-run or delete this cell.
load_top_stops_overground()

PLACE_ID,NAME,LN_NAME_SHORT,count,rank
i64,str,str,u32,u32
39661,"""Южный филиал (1 площадка-Домод…","""НГПТ""",437886,1
40174,"""Юго-Западный филиал (1 площадк…","""НГПТ""",325174,2
39655,"""Северо-западный филиал (1 площ…","""НГПТ""",315492,3
39711,"""Центральный филиал (1 площадка…","""НГПТ""",169969,4
39667,"""Северо-Восточный филиал (1 пло…","""НГПТ""",165215,5
…,…,…,…,…
41015,"""Стартранс (Троицк) 6102""","""НГПТ""",62896,16
39714,"""Центральный филиал (4 площадка…","""НГПТ""",61139,17
39685,"""Трансавтолиз (Батюнинский)""","""НГПТ""",60729,18
39670,"""Северо-Восточный филиал (4 пло…","""НГПТ""",59263,19
